summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile14
-rw-r--r--kernel/acct.c14
-rw-r--r--kernel/async.c119
-rw-r--r--kernel/audit.c155
-rw-r--r--kernel/audit.h43
-rw-r--r--kernel/audit_tree.c77
-rw-r--r--kernel/audit_watch.c543
-rw-r--r--kernel/auditfilter.c534
-rw-r--r--kernel/auditsc.c65
-rw-r--r--kernel/cgroup.c606
-rw-r--r--kernel/cgroup_debug.c2
-rw-r--r--kernel/compat.c11
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpuset.c521
-rw-r--r--kernel/cred.c297
-rw-r--r--kernel/dma-coherent.c47
-rw-r--r--kernel/exec_domain.c23
-rw-r--r--kernel/exit.c578
-rw-r--r--kernel/extable.c48
-rw-r--r--kernel/fork.c210
-rw-r--r--kernel/freezer.c7
-rw-r--r--kernel/futex.c1490
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/gcov/Kconfig48
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/base.c148
-rw-r--r--kernel/gcov/fs.c673
-rw-r--r--kernel/gcov/gcc_3_4.c447
-rw-r--r--kernel/gcov/gcov.h128
-rw-r--r--kernel/groups.c288
-rw-r--r--kernel/hrtimer.c188
-rw-r--r--kernel/hung_task.c217
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/chip.c100
-rw-r--r--kernel/irq/devres.c16
-rw-r--r--kernel/irq/handle.c210
-rw-r--r--kernel/irq/internals.h27
-rw-r--r--kernel/irq/manage.c586
-rw-r--r--kernel/irq/migration.c20
-rw-r--r--kernel/irq/numa_migrate.c72
-rw-r--r--kernel/irq/pm.c79
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c15
-rw-r--r--kernel/itimer.c4
-rw-r--r--kernel/kallsyms.c167
-rw-r--r--kernel/kexec.c37
-rw-r--r--kernel/kfifo.c4
-rw-r--r--kernel/kgdb.c4
-rw-r--r--kernel/kmod.c26
-rw-r--r--kernel/kprobes.c351
-rw-r--r--kernel/kthread.c119
-rw-r--r--kernel/latencytop.c83
-rw-r--r--kernel/lockdep.c1368
-rw-r--r--kernel/lockdep_internals.h51
-rw-r--r--kernel/lockdep_proc.c153
-rw-r--r--kernel/lockdep_states.h9
-rw-r--r--kernel/module.c526
-rw-r--r--kernel/mutex-debug.c9
-rw-r--r--kernel/mutex-debug.h18
-rw-r--r--kernel/mutex.c149
-rw-r--r--kernel/mutex.h22
-rw-r--r--kernel/ns_cgroup.c14
-rw-r--r--kernel/nsproxy.c19
-rw-r--r--kernel/panic.c172
-rw-r--r--kernel/params.c60
-rw-r--r--kernel/perf_counter.c4962
-rw-r--r--kernel/pid.c50
-rw-r--r--kernel/pid_namespace.c39
-rw-r--r--kernel/posix-cpu-timers.c218
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/Makefile7
-rw-r--r--kernel/power/console.c6
-rw-r--r--kernel/power/hibernate.c (renamed from kernel/power/disk.c)227
-rw-r--r--kernel/power/hibernate_nvs.c135
-rw-r--r--kernel/power/main.c503
-rw-r--r--kernel/power/power.h25
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c5
-rw-r--r--kernel/power/snapshot.c89
-rw-r--r--kernel/power/suspend.c300
-rw-r--r--kernel/power/suspend_test.c187
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/power/swsusp.c200
-rw-r--r--kernel/power/user.c18
-rw-r--r--kernel/printk.c249
-rw-r--r--kernel/profile.c20
-rw-r--r--kernel/ptrace.c300
-rw-r--r--kernel/rcuclassic.c788
-rw-r--r--kernel/rcupdate.c94
-rw-r--r--kernel/rcupreempt.c1502
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c227
-rw-r--r--kernel/rcutree.c333
-rw-r--r--kernel/rcutree.h259
-rw-r--r--kernel/rcutree_plugin.h532
-rw-r--r--kernel/rcutree_trace.c148
-rw-r--r--kernel/relay.c18
-rw-r--r--kernel/res_counter.c12
-rw-r--r--kernel/resource.c48
-rw-r--r--kernel/rtmutex.c248
-rw-r--r--kernel/rtmutex_common.h8
-rw-r--r--kernel/sched.c3024
-rw-r--r--kernel/sched_clock.c41
-rw-r--r--kernel/sched_cpupri.c58
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c18
-rw-r--r--kernel/sched_fair.c221
-rw-r--r--kernel/sched_features.h6
-rw-r--r--kernel/sched_idletask.c3
-rw-r--r--kernel/sched_rt.c620
-rw-r--r--kernel/sched_stats.h55
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/signal.c200
-rw-r--r--kernel/slow-work.c645
-rw-r--r--kernel/smp.c408
-rw-r--r--kernel/softirq.c125
-rw-r--r--kernel/softlockup.c109
-rw-r--r--kernel/spinlock.c240
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c366
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c175
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c39
-rw-r--r--kernel/time/clocksource.c107
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c444
-rw-r--r--kernel/time/tick-broadcast.c9
-rw-r--r--kernel/time/tick-common.c38
-rw-r--r--kernel/time/tick-oneshot.c17
-rw-r--r--kernel/time/tick-sched.c21
-rw-r--r--kernel/time/timecompare.c191
-rw-r--r--kernel/time/timekeeping.c21
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c16
-rw-r--r--kernel/timer.c317
-rw-r--r--kernel/trace/Kconfig301
-rw-r--r--kernel/trace/Makefile21
-rw-r--r--kernel/trace/blktrace.c1728
-rw-r--r--kernel/trace/ftrace.c2095
-rw-r--r--kernel/trace/kmemtrace.c511
-rw-r--r--kernel/trace/ring_buffer.c2375
-rw-r--r--kernel/trace/ring_buffer_benchmark.c419
-rw-r--r--kernel/trace/trace.c3727
-rw-r--r--kernel/trace/trace.h423
-rw-r--r--kernel/trace/trace_boot.c49
-rw-r--r--kernel/trace/trace_branch.c288
-rw-r--r--kernel/trace/trace_clock.c109
-rw-r--r--kernel/trace/trace_event_profile.c39
-rw-r--r--kernel/trace/trace_event_types.h178
-rw-r--r--kernel/trace/trace_events.c1505
-rw-r--r--kernel/trace/trace_events_filter.c1223
-rw-r--r--kernel/trace/trace_export.c206
-rw-r--r--kernel/trace/trace_functions.c368
-rw-r--r--kernel/trace/trace_functions_graph.c855
-rw-r--r--kernel/trace/trace_hw_branches.c246
-rw-r--r--kernel/trace/trace_irqsoff.c58
-rw-r--r--kernel/trace/trace_mmiotrace.c67
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c1202
-rw-r--r--kernel/trace/trace_output.h51
-rw-r--r--kernel/trace/trace_power.c215
-rw-r--r--kernel/trace/trace_printk.c252
-rw-r--r--kernel/trace/trace_sched_switch.c94
-rw-r--r--kernel/trace/trace_sched_wakeup.c116
-rw-r--r--kernel/trace/trace_selftest.c247
-rw-r--r--kernel/trace/trace_stack.c86
-rw-r--r--kernel/trace/trace_stat.c387
-rw-r--r--kernel/trace/trace_stat.h33
-rw-r--r--kernel/trace/trace_syscalls.c523
-rw-r--r--kernel/trace/trace_sysprof.c30
-rw-r--r--kernel/trace/trace_workqueue.c295
-rw-r--r--kernel/tracepoint.c57
-rw-r--r--kernel/tsacct.c6
-rw-r--r--kernel/user.c102
-rw-r--r--kernel/user_namespace.c21
-rw-r--r--kernel/utsname.c13
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/wait.c64
-rw-r--r--kernel/workqueue.c107
184 files changed, 38784 insertions, 13664 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b..b833bd5cc12 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o
+obj-y += groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -51,6 +52,7 @@ obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -67,20 +69,20 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -91,7 +93,11 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa3156416..9a4715a2f6b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
static int acct_on(char *name)
{
struct file *file;
+ struct vfsmount *mnt;
int error;
struct pid_namespace *ns;
struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
acct = NULL;
}
- mnt_pin(file->f_path.mnt);
+ mnt = file->f_path.mnt;
+ mnt_pin(mnt);
acct_file_reopen(ns->bacct, file, ns);
spin_unlock(&acct_lock);
- mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+ mntput(mnt); /* it's pinned, now give up active reference */
kfree(acct);
return 0;
@@ -489,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
u64 run_time;
struct timespec uptime;
struct tty_struct *tty;
+ const struct cred *orig_cred;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct, file))
- return;
+ goto out;
/*
* Fill the accounting struct with the needed info as recorded
@@ -576,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+out:
+ revert_creds(orig_cred);
}
/**
diff --git a/kernel/async.c b/kernel/async.c
index 608b32b4281..27235f5de19 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,11 +49,13 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
+#include <linux/bug.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
@@ -90,19 +92,18 @@ extern int initcall_debug;
static async_cookie_t __lowest_in_progress(struct list_head *running)
{
struct async_entry *entry;
+
if (!list_empty(running)) {
entry = list_first_entry(running,
struct async_entry, list);
return entry->cookie;
- } else if (!list_empty(&async_pending)) {
- entry = list_first_entry(&async_pending,
- struct async_entry, list);
- return entry->cookie;
- } else {
- /* nothing in progress... next_cookie is "infinity" */
- return next_cookie;
}
+ list_for_each_entry(entry, &async_pending, list)
+ if (entry->running == running)
+ return entry->cookie;
+
+ return next_cookie; /* "infinity" value */
}
static async_cookie_t lowest_in_progress(struct list_head *running)
@@ -132,21 +133,23 @@ static void run_one_entry(void)
entry = list_first_entry(&async_pending, struct async_entry, list);
/* 2) move it to the running queue */
- list_del(&entry->list);
- list_add_tail(&entry->list, &async_running);
+ list_move_tail(&entry->list, entry->running);
spin_unlock_irqrestore(&async_lock, flags);
/* 3) run it (and print duration)*/
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
+ printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
- entry->func, ktime_to_ns(delta) >> 10);
+ printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+ (long long)entry->cookie,
+ entry->func,
+ (long long)ktime_to_ns(delta) >> 10);
}
/* 4) remove it from the running queue */
@@ -205,18 +208,44 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
return newcookie;
}
+/**
+ * async_schedule - schedule a function for asynchronous execution
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
- return __async_schedule(ptr, data, &async_pending);
+ return __async_schedule(ptr, data, &async_running);
}
EXPORT_SYMBOL_GPL(async_schedule);
-async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+/**
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @running: running list for the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @running may be used in the async_synchronize_*_domain() functions
+ * to wait within a certain synchronization domain rather than globally.
+ * A synchronization domain is specified via the running queue @running to use.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+ struct list_head *running)
{
return __async_schedule(ptr, data, running);
}
-EXPORT_SYMBOL_GPL(async_schedule_special);
+EXPORT_SYMBOL_GPL(async_schedule_domain);
+/**
+ * async_synchronize_full - synchronize all asynchronous function calls
+ *
+ * This function waits until all asynchronous function calls have been done.
+ */
void async_synchronize_full(void)
{
do {
@@ -225,13 +254,30 @@ void async_synchronize_full(void)
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
-void async_synchronize_full_special(struct list_head *list)
+/**
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
+ * @list: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
+ */
+void async_synchronize_full_domain(struct list_head *list)
{
- async_synchronize_cookie_special(next_cookie, list);
+ async_synchronize_cookie_domain(next_cookie, list);
}
-EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
-void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+/**
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ * @running: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list submitted
+ * prior to @cookie have been done.
+ */
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+ struct list_head *running)
{
ktime_t starttime, delta, endtime;
@@ -247,14 +293,22 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
delta = ktime_sub(endtime, starttime);
printk("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current), ktime_to_ns(delta) >> 10);
+ task_pid_nr(current),
+ (long long)ktime_to_ns(delta) >> 10);
}
}
-EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
+/**
+ * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ *
+ * This function waits until all asynchronous function calls prior to @cookie
+ * have been done.
+ */
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_special(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
@@ -315,7 +369,11 @@ static int async_manager_thread(void *unused)
ec = atomic_read(&entry_count);
while (tc < ec && tc < MAX_THREADS) {
- kthread_run(async_thread, NULL, "async/%i", tc);
+ if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
+ tc))) {
+ msleep(100);
+ continue;
+ }
atomic_inc(&thread_count);
tc++;
}
@@ -329,18 +387,11 @@ static int async_manager_thread(void *unused)
static int __init async_init(void)
{
- if (async_enabled)
- kthread_run(async_manager_thread, NULL, "async/mgr");
- return 0;
-}
+ async_enabled =
+ !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
-static int __init setup_async(char *str)
-{
- async_enabled = 1;
- return 1;
+ WARN_ON(!async_enabled);
+ return 0;
}
-__setup("fastboot", setup_async);
-
-
core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131..defc2e6f1e3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -115,9 +115,6 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
-
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -136,7 +133,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
/* Serialize requests from userspace. */
-static DEFINE_MUTEX(audit_cmd_mutex);
+DEFINE_MUTEX(audit_cmd_mutex);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -375,6 +372,25 @@ static void audit_hold_skb(struct sk_buff *skb)
kfree_skb(skb);
}
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ char *data = NLMSG_DATA(nlh);
+
+ if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+ else
+ audit_log_lost("printk limit exceeded\n");
+ }
+
+ audit_hold_skb(skb);
+}
+
static void kauditd_send_skb(struct sk_buff *skb)
{
int err;
@@ -427,14 +443,8 @@ static int kauditd_thread(void *dummy)
if (skb) {
if (audit_pid)
kauditd_send_skb(skb);
- else {
- if (printk_ratelimit())
- printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
- else
- audit_log_lost("printk limit exceeded\n");
-
- audit_hold_skb(skb);
- }
+ else
+ audit_printk_skb(skb);
} else {
DECLARE_WAITQUEUE(wait, current);
set_current_state(TASK_INTERRUPTIBLE);
@@ -495,42 +505,25 @@ int audit_send_list(void *_dest)
return 0;
}
-#ifdef CONFIG_AUDIT_TREE
-static int prune_tree_thread(void *unused)
-{
- mutex_lock(&audit_cmd_mutex);
- audit_prune_trees();
- mutex_unlock(&audit_cmd_mutex);
- return 0;
-}
-
-void audit_schedule_prune(void)
-{
- kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
-}
-#endif
-
struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
int multi, void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
- int len = NLMSG_SPACE(size);
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
- skb = alloc_skb(len, GFP_KERNEL);
+ skb = nlmsg_new(size, GFP_KERNEL);
if (!skb)
return NULL;
- nlh = NLMSG_PUT(skb, pid, seq, t, size);
- nlh->nlmsg_flags = flags;
- data = NLMSG_DATA(nlh);
+ nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
+ data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_PUT */
+nlmsg_failure: /* Used by NLMSG_NEW */
if (skb)
kfree_skb(skb);
return NULL;
@@ -766,6 +759,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_format(ab, " msg=");
size = nlmsg_len(nlh);
+ if (size > 0 &&
+ ((unsigned char *)data)[size - 1] == '\0')
+ size--;
audit_log_n_untrustedstring(ab, data, size);
}
audit_set_pid(ab, pid);
@@ -923,28 +919,29 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/*
- * Get message from skb (based on rtnetlink_rcv_skb). Each message is
- * processed by audit_receive_msg. Malformed skbs with wrong length are
- * discarded silently.
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
*/
static void audit_receive_skb(struct sk_buff *skb)
{
- int err;
- struct nlmsghdr *nlh;
- u32 rlen;
+ struct nlmsghdr *nlh;
+ /*
+ * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * if the nlmsg_len was not aligned
+ */
+ int len;
+ int err;
- while (skb->len >= NLMSG_SPACE(0)) {
- nlh = nlmsg_hdr(skb);
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if ((err = audit_receive_msg(skb, nlh))) {
+ nlh = nlmsg_hdr(skb);
+ len = skb->len;
+
+ while (NLMSG_OK(nlh, len)) {
+ err = audit_receive_msg(skb, nlh);
+ /* if err or if this message says it wants a response */
+ if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- } else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
+
+ nlh = NLMSG_NEXT(nlh, len);
}
}
@@ -956,13 +953,6 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
-#ifdef CONFIG_AUDITSYSCALL
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
-};
-#endif
-
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
@@ -988,12 +978,6 @@ static int __init audit_init(void)
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
-#ifdef CONFIG_AUDITSYSCALL
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
-#endif
-
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
@@ -1067,18 +1051,20 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
goto err;
}
- ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
- if (!ab->skb)
- goto err;
-
ab->ctx = ctx;
ab->gfp_mask = gfp_mask;
- nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
- nlh->nlmsg_type = type;
- nlh->nlmsg_flags = 0;
- nlh->nlmsg_pid = 0;
- nlh->nlmsg_seq = 0;
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto nlmsg_failure;
+
+ nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+
return ab;
+
+nlmsg_failure: /* Used by NLMSG_NEW */
+ kfree_skb(ab->skb);
+ ab->skb = NULL;
err:
audit_buffer_free(ab);
return NULL;
@@ -1382,7 +1368,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
int audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
- for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+ for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return 1;
}
@@ -1437,18 +1423,27 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_format(ab, "<no memory>");
+ audit_log_string(ab, "<no_memory>");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_format(ab, "<too long>");
+ audit_log_string(ab, "<too_long>");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
}
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+ audit_log_format(ab, " key=");
+ if (key)
+ audit_log_untrustedstring(ab, key);
+ else
+ audit_log_format(ab, "(null)");
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
@@ -1472,15 +1467,7 @@ void audit_log_end(struct audit_buffer *ab)
skb_queue_tail(&audit_skb_queue, ab->skb);
wake_up_interruptible(&kauditd_wait);
} else {
- if (nlh->nlmsg_type != AUDIT_EOE) {
- if (printk_ratelimit()) {
- printk(KERN_NOTICE "type=%d %s\n",
- nlh->nlmsg_type,
- ab->skb->data + NLMSG_SPACE(0));
- } else
- audit_log_lost("printk limit exceeded\n");
- }
- audit_hold_skb(ab->skb);
+ audit_printk_skb(ab->skb);
}
ab->skb = NULL;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 16f18cac661..208687be4f3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -53,18 +53,7 @@ enum audit_state {
};
/* Rule lists */
-struct audit_parent;
-
-struct audit_watch {
- atomic_t count; /* reference count */
- char *path; /* insertion path */
- dev_t dev; /* associated superblock device */
- unsigned long ino; /* associated inode number */
- struct audit_parent *parent; /* associated parent */
- struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
-};
-
+struct audit_watch;
struct audit_tree;
struct audit_chunk;
@@ -108,19 +97,28 @@ struct audit_netlink_list {
int audit_send_list(void *);
-struct inotify_watch;
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
-extern void audit_free_parent(struct inotify_watch *);
-extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
- const char *, struct inode *);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+/* audit watch functions */
+extern unsigned long audit_watch_inode(struct audit_watch *watch);
+extern dev_t audit_watch_dev(struct audit_watch *watch);
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule);
+extern void audit_remove_watch(struct audit_watch *watch);
+extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
+extern void audit_inotify_unregister(struct list_head *in_list);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern struct list_head *audit_watch_rules(struct audit_watch *watch);
+
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch);
+
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
@@ -130,10 +128,9 @@ extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern void audit_schedule_prune(void);
-extern void audit_prune_trees(void);
extern const char *audit_tree_path(struct audit_tree *);
extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -142,6 +139,7 @@ extern void audit_put_tree(struct audit_tree *);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
+#define audit_kill_trees(list) BUG()
#endif
extern char *audit_unpack_string(void **, size_t *, size_t);
@@ -160,7 +158,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
return 0;
}
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
#define audit_filter_inodes(t,c) AUDIT_DISABLED
#endif
+
+extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db..2451dc6f328 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -2,6 +2,7 @@
#include <linux/inotify.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/kthread.h>
struct audit_tree;
struct audit_chunk;
@@ -385,6 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
mutex_lock(&inode->inotify_mutex);
if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
mutex_unlock(&inode->inotify_mutex);
+ put_inotify_watch(&old->watch);
free_chunk(chunk);
return -ENOSPC;
}
@@ -394,6 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk->dead = 1;
inotify_evict_watch(&chunk->watch);
mutex_unlock(&inode->inotify_mutex);
+ put_inotify_watch(&old->watch);
put_inotify_watch(&chunk->watch);
return 0;
}
@@ -439,13 +442,11 @@ static void kill_rules(struct audit_tree *tree)
if (rule->tree) {
/* not a half-baked one */
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=remove rule dir=");
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
- if (rule->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, rule->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
+ audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
rule->tree = NULL;
@@ -517,6 +518,8 @@ static void trim_marked(struct audit_tree *tree)
}
}
+static void audit_schedule_prune(void);
+
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
@@ -566,7 +569,7 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(path.mnt, path.dentry);
+ root_mnt = collect_mounts(&path);
path_put(&path);
if (!root_mnt)
goto skip_it;
@@ -658,7 +661,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(path.mnt, path.dentry);
+ mnt = collect_mounts(&path);
path_put(&path);
if (!mnt) {
err = -ENOMEM;
@@ -718,7 +721,7 @@ int audit_tag_tree(char *old, char *new)
err = kern_path(new, 0, &path);
if (err)
return err;
- tagged = collect_mounts(path.mnt, path.dentry);
+ tagged = collect_mounts(&path);
path_put(&path);
if (!tagged)
return -ENOMEM;
@@ -732,9 +735,6 @@ int audit_tag_tree(char *old, char *new)
dentry = dget(path.dentry);
path_put(&path);
- if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
- follow_up(&mnt, &dentry);
-
list_add_tail(&list, &tagged->mnt_list);
mutex_lock(&audit_filter_mutex);
@@ -825,10 +825,11 @@ int audit_tag_tree(char *old, char *new)
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread, with audit_cmd_mutex held.
+ * Runs from a separate thread.
*/
-void audit_prune_trees(void)
+static int prune_tree_thread(void *unused)
{
+ mutex_lock(&audit_cmd_mutex);
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -845,6 +846,40 @@ void audit_prune_trees(void)
}
mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+ return 0;
+}
+
+static void audit_schedule_prune(void)
+{
+ kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall. Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(list->next, struct audit_tree, list);
+ kill_rules(victim);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
}
/*
@@ -855,6 +890,8 @@ void audit_prune_trees(void)
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
+ struct list_head *postponed = audit_killed_trees();
+ int need_prune = 0;
int n;
if (chunk->dead)
@@ -870,15 +907,21 @@ static void evict_chunk(struct audit_chunk *chunk)
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
- kill_rules(owner);
- list_move(&owner->list, &prune_list);
- audit_schedule_prune();
+ if (!postponed) {
+ kill_rules(owner);
+ list_move(&owner->list, &prune_list);
+ need_prune = 1;
+ } else {
+ list_move(&owner->list, postponed);
+ }
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
+ if (need_prune)
+ audit_schedule_prune();
mutex_unlock(&audit_filter_mutex);
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 00000000000..0e96dbc60ea
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,543 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ char *path; /* insertion path */
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* associated rules */
+};
+
+struct audit_parent {
+ struct list_head ilist; /* entry in inotify registration list */
+ struct list_head watches; /* associated watches */
+ struct inotify_watch wdata; /* inotify watch data */
+ unsigned flags; /* status flags */
+};
+
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID 0x001
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+static void audit_free_parent(struct inotify_watch *i_watch)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ put_inotify_watch(&watch->parent->wdata);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+ return watch->path;
+}
+
+struct list_head *audit_watch_rules(struct audit_watch *watch)
+{
+ return &watch->rules;
+}
+
+unsigned long audit_watch_inode(struct audit_watch *watch)
+{
+ return watch->ino;
+}
+
+dev_t audit_watch_dev(struct audit_watch *watch)
+{
+ return watch->dev;
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+ struct audit_parent *parent;
+ s32 wd;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+ parent->flags = 0;
+
+ inotify_init_watch(&parent->wdata);
+ /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+ get_inotify_watch(&parent->wdata);
+ wd = inotify_add_watch(audit_ih, &parent->wdata,
+ ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
+ if (wd < 0) {
+ audit_free_parent(&parent->wdata);
+ return ERR_PTR(wd);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_ih)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op != Audit_equal ||
+ krule->inode_f || krule->watch || krule->tree)
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (IS_ERR(watch))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+ char *path;
+ struct audit_watch *new;
+
+ path = kstrdup(old->path, GFP_KERNEL);
+ if (unlikely(!path))
+ return ERR_PTR(-ENOMEM);
+
+ new = audit_init_watch(path);
+ if (IS_ERR(new)) {
+ kfree(path);
+ goto out;
+ }
+
+ new->dev = old->dev;
+ new->ino = old->ino;
+ get_inotify_watch(&old->parent->wdata);
+ new->parent = old->parent;
+
+out:
+ return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
+ }
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+ const char *dname, dev_t dev,
+ unsigned long ino, unsigned invalidating)
+{
+ struct audit_watch *owatch, *nwatch, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *oentry, *nentry;
+
+ mutex_lock(&audit_filter_mutex);
+ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+ if (audit_compare_dname_path(dname, owatch->path, NULL))
+ continue;
+
+ /* If the update involves invalidating rules, do the inode-based
+ * filtering now, so we don't omit records. */
+ if (invalidating && current->audit_context)
+ audit_filter_inodes(current, current->audit_context);
+
+ nwatch = audit_dupe_watch(owatch);
+ if (IS_ERR(nwatch)) {
+ mutex_unlock(&audit_filter_mutex);
+ audit_panic("error updating watch, skipping");
+ return;
+ }
+ nwatch->dev = dev;
+ nwatch->ino = ino;
+
+ list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+ oentry = container_of(r, struct audit_entry, rule);
+ list_del(&oentry->rule.rlist);
+ list_del_rcu(&oentry->list);
+
+ nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ if (IS_ERR(nentry)) {
+ list_del(&oentry->rule.list);
+ audit_panic("error updating watch, removing");
+ } else {
+ int h = audit_hash_ino((u32)ino);
+ list_add(&nentry->rule.rlist, &nwatch->rules);
+ list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+ list_replace(&oentry->rule.list,
+ &nentry->rule.list);
+ }
+
+ audit_watch_log_rule_change(r, owatch, "updated rules");
+
+ call_rcu(&oentry->rcu, audit_free_rule_rcu);
+ }
+
+ audit_remove_watch(owatch);
+ goto add_watch_to_parent; /* event applies to a single watch */
+ }
+ mutex_unlock(&audit_filter_mutex);
+ return;
+
+add_watch_to_parent:
+ list_add(&nwatch->wlist, &parent->watches);
+ mutex_unlock(&audit_filter_mutex);
+ return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+ struct audit_watch *w, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *e;
+
+ mutex_lock(&audit_filter_mutex);
+ parent->flags |= AUDIT_PARENT_INVALID;
+ list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+ list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+ e = container_of(r, struct audit_entry, rule);
+ audit_watch_log_rule_change(r, w, "remove rule");
+ list_del(&r->rlist);
+ list_del(&r->list);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ }
+ audit_remove_watch(w);
+ }
+ mutex_unlock(&audit_filter_mutex);
+}
+
+/* Unregister inotify watches for parents on in_list.
+ * Generates an IN_IGNORED event. */
+void audit_inotify_unregister(struct list_head *in_list)
+{
+ struct audit_parent *p, *n;
+
+ list_for_each_entry_safe(p, n, in_list, ilist) {
+ list_del(&p->ilist);
+ inotify_rm_watch(audit_ih, &p->wdata);
+ /* the unpin matching the pin in audit_do_del_rule() */
+ unpin_inotify_watch(&p->wdata);
+ }
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+{
+ struct nameidata *ndparent, *ndwatch;
+ int err;
+
+ ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
+ if (unlikely(!ndparent))
+ return -ENOMEM;
+
+ ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
+ if (unlikely(!ndwatch)) {
+ kfree(ndparent);
+ return -ENOMEM;
+ }
+
+ err = path_lookup(path, LOOKUP_PARENT, ndparent);
+ if (err) {
+ kfree(ndparent);
+ kfree(ndwatch);
+ return err;
+ }
+
+ err = path_lookup(path, 0, ndwatch);
+ if (err) {
+ kfree(ndwatch);
+ ndwatch = NULL;
+ }
+
+ *ndp = ndparent;
+ *ndw = ndwatch;
+
+ return 0;
+}
+
+/* Release resources used for watch path information. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+ if (ndp) {
+ path_put(&ndp->path);
+ kfree(ndp);
+ }
+ if (ndw) {
+ path_put(&ndw->path);
+ kfree(ndw);
+ }
+}
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w);
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ get_inotify_watch(&parent->wdata);
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule)
+{
+ struct audit_watch *watch = krule->watch;
+ struct inotify_watch *i_watch;
+ struct audit_parent *parent;
+ struct nameidata *ndp = NULL, *ndw = NULL;
+ int ret = 0;
+
+ mutex_unlock(&audit_filter_mutex);
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ ret = audit_get_nd(watch->path, &ndp, &ndw);
+ if (ret) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ goto error;
+ }
+
+ /* update watch filter fields */
+ if (ndw) {
+ watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->path.dentry->d_inode->i_ino;
+ }
+
+ /* The audit_filter_mutex must not be held during inotify calls because
+ * we hold it during inotify event callback processing. If an existing
+ * inotify watch is found, inotify_find_watch() grabs a reference before
+ * returning.
+ */
+ if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+ &i_watch) < 0) {
+ parent = audit_init_parent(ndp);
+ if (IS_ERR(parent)) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ ret = PTR_ERR(parent);
+ goto error;
+ }
+ } else
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ mutex_lock(&audit_filter_mutex);
+
+ /* parent was moved before we took audit_filter_mutex */
+ if (parent->flags & AUDIT_PARENT_INVALID)
+ ret = -ENOENT;
+ else
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_init_parent or inotify_find_watch */
+ put_inotify_watch(&parent->wdata);
+
+error:
+ audit_put_nd(ndp, ndw); /* NULL args OK */
+ return ret;
+
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&krule->rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ /* Put parent on the inotify un-registration
+ * list. Grab a reference before releasing
+ * audit_filter_mutex, to be released in
+ * audit_inotify_unregister().
+ * If filesystem is going away, just leave
+ * the sucker alone, eviction will take
+ * care of it. */
+ if (pin_inotify_watch(&parent->wdata))
+ list_add(&parent->ilist, list);
+ }
+ }
+}
+
+/* Update watch data in audit rules based on inotify events. */
+static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+ u32 cookie, const char *dname, struct inode *inode)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev,
+ inode->i_ino, 0);
+ else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ /* inotify automatically removes the watch and sends IN_IGNORED */
+ else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
+ audit_remove_parent_watches(parent);
+ /* inotify does not remove the watch, so remove it manually */
+ else if(mask & IN_MOVE_SELF) {
+ audit_remove_parent_watches(parent);
+ inotify_remove_watch_locked(audit_ih, i_watch);
+ } else if (mask & IN_IGNORED)
+ put_inotify_watch(i_watch);
+}
+
+static const struct inotify_operations audit_inotify_ops = {
+ .handle_event = audit_handle_ievent,
+ .destroy_watch = audit_free_parent,
+};
+
+static int __init audit_watch_init(void)
+{
+ audit_ih = inotify_init(&audit_inotify_ops);
+ if (IS_ERR(audit_ih))
+ audit_panic("cannot initialize inotify handle");
+ return 0;
+}
+subsys_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d9..a70604047f3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,7 +27,6 @@
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
-#include <linux/inotify.h>
#include <linux/security.h>
#include "audit.h"
@@ -44,36 +43,6 @@
* be written directly provided audit_filter_mutex is held.
*/
-/*
- * Reference counting:
- *
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
- * event. Each audit_watch holds a reference to its associated parent.
- *
- * audit_watch: if added to lists, lifetime is from audit_init_watch() to
- * audit_remove_watch(). Additionally, an audit_watch may exist
- * temporarily to assist in searching existing filter data. Each
- * audit_krule holds a reference to its associated watch.
- */
-
-struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
-};
-
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
-
/* Audit filter lists, defined in <linux/audit.h> */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[0]),
@@ -97,56 +66,21 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
DEFINE_MUTEX(audit_filter_mutex);
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
-
-void audit_free_parent(struct inotify_watch *i_watch)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
-}
-
-static inline void audit_get_watch(struct audit_watch *watch)
-{
- atomic_inc(&watch->count);
-}
-
-static void audit_put_watch(struct audit_watch *watch)
-{
- if (atomic_dec_and_test(&watch->count)) {
- WARN_ON(watch->parent);
- WARN_ON(!list_empty(&watch->rules));
- kfree(watch->path);
- kfree(watch);
- }
-}
-
-static void audit_remove_watch(struct audit_watch *watch)
-{
- list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
- watch->parent = NULL;
- audit_put_watch(watch); /* match initial get */
-}
-
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
-
+ struct audit_krule *erule = &e->rule;
/* some rules don't have associated watches */
- if (e->rule.watch)
- audit_put_watch(e->rule.watch);
- if (e->rule.fields)
- for (i = 0; i < e->rule.field_count; i++) {
- struct audit_field *f = &e->rule.fields[i];
+ if (erule->watch)
+ audit_put_watch(erule->watch);
+ if (erule->fields)
+ for (i = 0; i < erule->field_count; i++) {
+ struct audit_field *f = &erule->fields[i];
kfree(f->lsm_str);
security_audit_rule_free(f->lsm_rule);
}
- kfree(e->rule.fields);
- kfree(e->rule.filterkey);
+ kfree(erule->fields);
+ kfree(erule->filterkey);
kfree(e);
}
@@ -156,50 +90,6 @@ void audit_free_rule_rcu(struct rcu_head *head)
audit_free_rule(e);
}
-/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
-{
- struct audit_parent *parent;
- s32 wd;
-
- parent = kzalloc(sizeof(*parent), GFP_KERNEL);
- if (unlikely(!parent))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
- }
-
- return parent;
-}
-
-/* Initialize a watch entry. */
-static struct audit_watch *audit_init_watch(char *path)
-{
- struct audit_watch *watch;
-
- watch = kzalloc(sizeof(*watch), GFP_KERNEL);
- if (unlikely(!watch))
- return ERR_PTR(-ENOMEM);
-
- INIT_LIST_HEAD(&watch->rules);
- atomic_set(&watch->count, 1);
- watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
-
- return watch;
-}
-
/* Initialize an audit filterlist entry. */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
@@ -260,31 +150,6 @@ static inline int audit_to_inode(struct audit_krule *krule,
return 0;
}
-/* Translate a watch string to kernel respresentation. */
-static int audit_to_watch(struct audit_krule *krule, char *path, int len,
- u32 op)
-{
- struct audit_watch *watch;
-
- if (!audit_ih)
- return -EOPNOTSUPP;
-
- if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
- op != Audit_equal ||
- krule->inode_f || krule->watch || krule->tree)
- return -EINVAL;
-
- watch = audit_init_watch(path);
- if (IS_ERR(watch))
- return PTR_ERR(watch);
-
- audit_get_watch(watch);
- krule->watch = watch;
-
- return 0;
-}
-
static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
@@ -766,7 +631,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
break;
case AUDIT_WATCH:
data->buflen += data->values[i] =
- audit_pack_string(&bufp, krule->watch->path);
+ audit_pack_string(&bufp,
+ audit_watch_path(krule->watch));
break;
case AUDIT_DIR:
data->buflen += data->values[i] =
@@ -818,7 +684,8 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 1;
break;
case AUDIT_WATCH:
- if (strcmp(a->watch->path, b->watch->path))
+ if (strcmp(audit_watch_path(a->watch),
+ audit_watch_path(b->watch)))
return 1;
break;
case AUDIT_DIR:
@@ -844,32 +711,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 0;
}
-/* Duplicate the given audit watch. The new watch's rules list is initialized
- * to an empty list and wlist is undefined. */
-static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
-{
- char *path;
- struct audit_watch *new;
-
- path = kstrdup(old->path, GFP_KERNEL);
- if (unlikely(!path))
- return ERR_PTR(-ENOMEM);
-
- new = audit_init_watch(path);
- if (IS_ERR(new)) {
- kfree(path);
- goto out;
- }
-
- new->dev = old->dev;
- new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
- new->parent = old->parent;
-
-out:
- return new;
-}
-
/* Duplicate LSM field information. The lsm_rule is opaque, so must be
* re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
@@ -904,8 +745,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -977,137 +818,6 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
return entry;
}
-/* Update inode info in audit rules based on filesystem event. */
-static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
- unsigned long ino, unsigned invalidating)
-{
- struct audit_watch *owatch, *nwatch, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *oentry, *nentry;
-
- mutex_lock(&audit_filter_mutex);
- list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
- if (audit_compare_dname_path(dname, owatch->path, NULL))
- continue;
-
- /* If the update involves invalidating rules, do the inode-based
- * filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
- audit_filter_inodes(current, current->audit_context);
-
- nwatch = audit_dupe_watch(owatch);
- if (IS_ERR(nwatch)) {
- mutex_unlock(&audit_filter_mutex);
- audit_panic("error updating watch, skipping");
- return;
- }
- nwatch->dev = dev;
- nwatch->ino = ino;
-
- list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
-
- oentry = container_of(r, struct audit_entry, rule);
- list_del(&oentry->rule.rlist);
- list_del_rcu(&oentry->list);
-
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
- if (IS_ERR(nentry)) {
- list_del(&oentry->rule.list);
- audit_panic("error updating watch, removing");
- } else {
- int h = audit_hash_ino((u32)ino);
- list_add(&nentry->rule.rlist, &nwatch->rules);
- list_add_rcu(&nentry->list, &audit_inode_hash[h]);
- list_replace(&oentry->rule.list,
- &nentry->rule.list);
- }
-
- call_rcu(&oentry->rcu, audit_free_rule_rcu);
- }
-
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab,
- " op=updated rules specifying path=");
- audit_log_untrustedstring(ab, owatch->path);
- audit_log_format(ab, " with dev=%u ino=%lu\n",
- dev, ino);
- audit_log_format(ab, " list=%d res=1", r->listnr);
- audit_log_end(ab);
- }
- audit_remove_watch(owatch);
- goto add_watch_to_parent; /* event applies to a single watch */
- }
- mutex_unlock(&audit_filter_mutex);
- return;
-
-add_watch_to_parent:
- list_add(&nwatch->wlist, &parent->watches);
- mutex_unlock(&audit_filter_mutex);
- return;
-}
-
-/* Remove all watches & rules associated with a parent that is going away. */
-static void audit_remove_parent_watches(struct audit_parent *parent)
-{
- struct audit_watch *w, *nextw;
- struct audit_krule *r, *nextr;
- struct audit_entry *e;
-
- mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
- list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
- list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
- e = container_of(r, struct audit_entry, rule);
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL,
- AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "auid=%u ses=%u",
- audit_get_loginuid(current),
- audit_get_sessionid(current));
- audit_log_format(ab, " op=remove rule path=");
- audit_log_untrustedstring(ab, w->path);
- if (r->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab,
- r->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
- audit_log_format(ab, " list=%d res=1",
- r->listnr);
- audit_log_end(ab);
- }
- list_del(&r->rlist);
- list_del(&r->list);
- list_del_rcu(&e->list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
- }
- audit_remove_watch(w);
- }
- mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-static void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
-
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
-}
-
/* Find an existing audit rule.
* Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
@@ -1145,134 +855,6 @@ out:
return found;
}
-/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp,
- struct nameidata **ndw)
-{
- struct nameidata *ndparent, *ndwatch;
- int err;
-
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
-
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
- }
-
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
- }
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
- }
-
- *ndp = ndparent;
- *ndw = ndwatch;
-
- return 0;
-}
-
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
-/* Associate the given rule with an existing parent inotify_watch.
- * Caller must hold audit_filter_mutex. */
-static void audit_add_to_parent(struct audit_krule *krule,
- struct audit_parent *parent)
-{
- struct audit_watch *w, *watch = krule->watch;
- int watch_found = 0;
-
- list_for_each_entry(w, &parent->watches, wlist) {
- if (strcmp(watch->path, w->path))
- continue;
-
- watch_found = 1;
-
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
- audit_put_watch(watch);
-
- audit_get_watch(w);
- krule->watch = watch = w;
- break;
- }
-
- if (!watch_found) {
- get_inotify_watch(&parent->wdata);
- watch->parent = parent;
-
- list_add(&watch->wlist, &parent->watches);
- }
- list_add(&krule->rlist, &watch->rules);
-}
-
-/* Find a matching watch entry, or add this one.
- * Caller must hold audit_filter_mutex. */
-static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
- struct nameidata *ndw)
-{
- struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
- struct audit_parent *parent;
- int ret = 0;
-
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
-
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- mutex_unlock(&audit_filter_mutex);
-
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
- parent = audit_init_parent(ndp);
- if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- return PTR_ERR(parent);
- }
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
-
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
-
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
- return ret;
-}
-
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;
@@ -1282,7 +864,6 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_entry *e;
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
- struct nameidata *ndp = NULL, *ndw = NULL;
struct list_head *list;
int h, err;
#ifdef CONFIG_AUDITSYSCALL
@@ -1296,8 +877,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
- mutex_unlock(&audit_filter_mutex);
if (e) {
+ mutex_unlock(&audit_filter_mutex);
err = -EEXIST;
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
@@ -1305,22 +886,16 @@ static inline int audit_add_rule(struct audit_entry *entry)
goto error;
}
- /* Avoid calling path_lookup under audit_filter_mutex. */
- if (watch) {
- err = audit_get_nd(watch->path, &ndp, &ndw);
- if (err)
- goto error;
- }
-
- mutex_lock(&audit_filter_mutex);
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule, ndp, ndw);
+ err = audit_add_watch(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
goto error;
}
- h = audit_hash_ino((u32)watch->ino);
+ /* entry->rule.watch may have changed during audit_add_watch() */
+ watch = entry->rule.watch;
+ h = audit_hash_ino((u32)audit_watch_inode(watch));
list = &audit_inode_hash[h];
}
if (tree) {
@@ -1358,11 +933,9 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- audit_put_nd(ndp, ndw); /* NULL args OK */
return 0;
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
if (watch)
audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
@@ -1372,7 +945,7 @@ error:
static inline int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+ struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
LIST_HEAD(inotify_list);
@@ -1394,29 +967,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
goto out;
}
- watch = e->rule.watch;
- if (watch) {
- struct audit_parent *parent = watch->parent;
-
- list_del(&e->rule.rlist);
-
- if (list_empty(&watch->rules)) {
- audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it.
- */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, &inotify_list);
- }
- }
- }
+ if (e->rule.watch)
+ audit_remove_watch_rule(&e->rule, &inotify_list);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -1438,8 +990,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
audit_inotify_unregister(&inotify_list);
out:
- if (tmp_watch)
- audit_put_watch(tmp_watch); /* match initial get */
+ if (watch)
+ audit_put_watch(watch); /* match initial get */
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -1527,11 +1079,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
security_release_secctx(ctx, len);
}
}
- audit_log_format(ab, " op=%s rule key=", action);
- if (rule->filterkey)
- audit_log_untrustedstring(ab, rule->filterkey);
- else
- audit_log_format(ab, "(null)");
+ audit_log_format(ab, " op=");
+ audit_log_string(ab, action);
+ audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
}
@@ -1595,7 +1145,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add",
+ audit_log_rule_change(loginuid, sessionid, sid, "add rule",
&entry->rule, !err);
if (err)
@@ -1611,7 +1161,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove",
+ audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
&entry->rule, !err);
audit_free_rule(entry);
@@ -1793,7 +1343,7 @@ static int update_lsm_rule(struct audit_krule *r)
list_del(&r->list);
} else {
if (watch) {
- list_add(&nentry->rule.rlist, &watch->rules);
+ list_add(&nentry->rule.rlist, audit_watch_rules(watch));
list_del(&r->rlist);
} else if (tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
@@ -1829,27 +1379,3 @@ int audit_update_lsm_rules(void)
return err;
}
-
-/* Update watch data in audit rules based on inotify events. */
-void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
-{
- struct audit_parent *parent;
-
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
- audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
- audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
-}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c28..68d3c6a0ecd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
#include <linux/syscalls.h>
#include <linux/inotify.h>
#include <linux/capability.h>
+#include <linux/fs_struct.h>
#include "audit.h"
@@ -198,6 +199,7 @@ struct audit_context {
struct audit_tree_refs *trees, *first_trees;
int tree_count;
+ struct list_head killed_trees;
int type;
union {
@@ -328,6 +330,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
*/
#ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+ if (!ctx->prio) {
+ ctx->prio = 1;
+ ctx->current_state = AUDIT_RECORD_CONTEXT;
+ }
+}
+
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
@@ -539,9 +549,9 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && rule->watch->ino != (unsigned long)-1)
- result = (name->dev == rule->watch->dev &&
- name->ino == rule->watch->ino);
+ if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
+ result = (name->dev == audit_watch_dev(rule->watch) &&
+ name->ino == audit_watch_inode(rule->watch));
break;
case AUDIT_DIR:
if (ctx)
@@ -741,17 +751,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-static void audit_set_auditable(struct audit_context *ctx)
-{
- if (!ctx->prio) {
- ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
- }
-}
-
static inline struct audit_context *audit_get_context(struct task_struct *tsk,
int return_valid,
- int return_code)
+ long return_code)
{
struct audit_context *context = tsk->audit_context;
@@ -852,6 +854,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
return NULL;
audit_zero_context(context, state);
+ INIT_LIST_HEAD(&context->killed_trees);
return context;
}
@@ -1023,8 +1026,8 @@ static int audit_log_single_execve_arg(struct audit_context *context,
{
char arg_num_len_buf[12];
const char __user *tmp_p = p;
- /* how many digits are in arg_num? 3 is the length of a=\n */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
+ /* how many digits are in arg_num? 5 is the length of ' a=""' */
+ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
size_t len, len_left, to_send;
size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
unsigned int i, has_cntl = 0, too_long = 0;
@@ -1109,7 +1112,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
- audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+ audit_log_format(*ab, " a%d_len=%zu", arg_num,
has_cntl ? 2*len : len);
/*
@@ -1129,15 +1132,14 @@ static int audit_log_single_execve_arg(struct audit_context *context,
buf[to_send] = '\0';
/* actually log it */
- audit_log_format(*ab, "a%d", arg_num);
+ audit_log_format(*ab, " a%d", arg_num);
if (too_long)
audit_log_format(*ab, "[%d]", i);
audit_log_format(*ab, "=");
if (has_cntl)
audit_log_n_hex(*ab, buf, to_send);
else
- audit_log_format(*ab, "\"%s\"", buf);
- audit_log_format(*ab, "\n");
+ audit_log_string(*ab, buf);
p += to_send;
len_left -= to_send;
@@ -1165,7 +1167,7 @@ static void audit_log_execve_info(struct audit_context *context,
p = (const char __user *)axi->mm->arg_start;
- audit_log_format(*ab, "argc=%d ", axi->argc);
+ audit_log_format(*ab, "argc=%d", axi->argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1372,11 +1374,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_task_info(ab, tsk);
- if (context->filterkey) {
- audit_log_format(ab, " key=");
- audit_log_untrustedstring(ab, context->filterkey);
- } else
- audit_log_format(ab, " key=(null)");
+ audit_log_key(ab, context->filterkey);
audit_log_end(ab);
for (aux = context->aux; aux; aux = aux->next) {
@@ -1478,7 +1476,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case 0:
/* name was specified as a relative path and the
* directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
+ audit_log_d_path(ab, "name=", &context->pwd);
break;
default:
/* log the name's directory component */
@@ -1549,6 +1547,8 @@ void audit_free(struct task_struct *tsk)
/* that can happen only if we are called from do_exit() */
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
audit_log_exit(context, tsk);
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
audit_free_context(context);
}
@@ -1692,6 +1692,9 @@ void audit_syscall_exit(int valid, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
+
if (context->previous) {
struct audit_context *new_context = context->previous;
context->previous = NULL;
@@ -2149,7 +2152,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
* @mode: mode bits
- * @u_attr: queue attributes
+ * @attr: queue attributes
*
*/
void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2199,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
/**
* __audit_mq_notify - record audit data for a POSIX MQ notify
* @mqdes: MQ descriptor
- * @u_notification: Notification event
+ * @notification: Notification event
*
*/
@@ -2525,3 +2528,11 @@ void audit_core_dumps(long signr)
audit_log_format(ab, " sig=%ld", signr);
audit_log_end(ab);
}
+
+struct list_head *audit_killed_trees(void)
+{
+ struct audit_context *ctx = current->audit_context;
+ if (likely(!ctx || !ctx->in_syscall))
+ return NULL;
+ return &ctx->killed_trees;
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7..c7ece8f027f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,8 @@
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <linux/pid_namespace.h>
#include <asm/atomic.h>
@@ -94,7 +96,6 @@ struct cgroupfs_root {
char release_agent_path[PATH_MAX];
};
-
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +103,39 @@ struct cgroupfs_root {
*/
static struct cgroupfs_root rootnode;
+/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX (65535)
+struct css_id {
+ /*
+ * The css to which this ID points. This pointer is set to valid value
+ * after cgroup is populated. If cgroup is removed, this will be NULL.
+ * This pointer is expected to be RCU-safe because destroy()
+ * is called after synchronize_rcu(). But for safe use, css_is_removed()
+ * css_tryget() should be used for avoiding race.
+ */
+ struct cgroup_subsys_state *css;
+ /*
+ * ID of this css.
+ */
+ unsigned short id;
+ /*
+ * Depth in hierarchy which this ID belongs to.
+ */
+ unsigned short depth;
+ /*
+ * ID is freed by RCU. (and lookup routine is RCU safe.)
+ */
+ struct rcu_head rcu_head;
+ /*
+ * Hierarchy of CSS ID belongs to.
+ */
+ unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+
/* The list of hierarchy roots */
static LIST_HEAD(roots);
@@ -185,6 +219,8 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -564,9 +600,13 @@ static struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;
static struct backing_dev_info cgroup_backing_dev_info = {
+ .name = "cgroup",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
+static int alloc_css_id(struct cgroup_subsys *ss,
+ struct cgroup *parent, struct cgroup *child);
+
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -585,13 +625,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
+ int ret = 0;
+
for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy)
- ss->pre_destroy(ss, cgrp);
- return;
+ if (ss->pre_destroy) {
+ ret = ss->pre_destroy(ss, cgrp);
+ if (ret)
+ break;
+ }
+ return ret;
}
static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +730,34 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
remove_dir(dentry);
}
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ wake_up_all(&cgroup_rmdir_waitq);
+}
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -784,6 +857,11 @@ static int parse_cgroupfs_options(char *data,
struct cgroup_sb_opts *opts)
{
char *token, *o = data ?: "all";
+ unsigned long mask = (unsigned long)-1;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~(1UL << cpuset_subsys_id);
+#endif
opts->subsys_bits = 0;
opts->flags = 0;
@@ -828,6 +906,15 @@ static int parse_cgroupfs_options(char *data,
}
}
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
+ (opts->subsys_bits & mask))
+ return -EINVAL;
+
/* We can't have an empty hierarchy */
if (!opts->subsys_bits)
return -EINVAL;
@@ -842,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
+ lock_kernel();
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
@@ -857,18 +945,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
}
ret = rebind_subsystems(root, opts.subsys_bits);
+ if (ret)
+ goto out_unlock;
/* (re)populate subsystem files */
- if (!ret)
- cgroup_populate_dir(cgrp);
+ cgroup_populate_dir(cgrp);
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
- if (opts.release_agent)
- kfree(opts.release_agent);
+ kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ unlock_kernel();
return ret;
}
@@ -885,6 +974,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->release_list);
+ INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -969,15 +1059,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
if (ret) {
- if (opts.release_agent)
- kfree(opts.release_agent);
+ kfree(opts.release_agent);
return ret;
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
- if (opts.release_agent)
- kfree(opts.release_agent);
+ kfree(opts.release_agent);
return -ENOMEM;
}
@@ -1071,13 +1159,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
mutex_unlock(&cgroup_mutex);
}
- return simple_set_mnt(mnt, sb);
+ simple_set_mnt(mnt, sb);
+ return 0;
free_cg_links:
free_cg_links(&tmp_cg_links);
drop_new_super:
- up_write(&sb->s_umount);
- deactivate_super(sb);
+ deactivate_locked_super(sb);
return ret;
}
@@ -1115,13 +1203,15 @@ static void cgroup_kill_sb(struct super_block *sb) {
}
write_unlock(&css_set_lock);
- list_del(&root->root_list);
- root_count--;
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ root_count--;
+ }
mutex_unlock(&cgroup_mutex);
- kfree(root);
kill_litter_super(sb);
+ kfree(root);
}
static struct file_system_type cgroup_fs_type = {
@@ -1277,6 +1367,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);
+
+ /*
+ * wake up rmdir() waiter. the rmdir should fail since the cgroup
+ * is no longer empty.
+ */
+ cgroup_wakeup_rmdir_waiter(cgrp);
return 0;
}
@@ -1622,10 +1718,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
- static struct dentry_operations cgroup_dops = {
+ static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
};
@@ -1668,7 +1764,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
* @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- int mode)
+ mode_t mode)
{
struct dentry *parent;
int error = 0;
@@ -1686,6 +1782,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
return error;
}
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+ mode_t mode = 0;
+
+ if (cft->mode)
+ return cft->mode;
+
+ if (cft->read || cft->read_u64 || cft->read_s64 ||
+ cft->read_map || cft->read_seq_string)
+ mode |= S_IRUGO;
+
+ if (cft->write || cft->write_u64 || cft->write_s64 ||
+ cft->write_string || cft->trigger)
+ mode |= S_IWUSR;
+
+ return mode;
+}
+
int cgroup_add_file(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
const struct cftype *cft)
@@ -1693,6 +1816,7 @@ int cgroup_add_file(struct cgroup *cgrp,
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
+ mode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1703,7 +1827,8 @@ int cgroup_add_file(struct cgroup *cgrp,
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
dentry = lookup_one_len(name, dir, strlen(name));
if (!IS_ERR(dentry)) {
- error = cgroup_create_file(dentry, 0644 | S_IFREG,
+ mode = cgroup_file_mode(cft);
+ error = cgroup_create_file(dentry, mode | S_IFREG,
cgrp->root->sb);
if (!error)
dentry->d_fsdata = (void *)cft;
@@ -2091,12 +2216,30 @@ err:
return ret;
}
+/*
+ * Cache pids for all threads in the same pid namespace that are
+ * opening the same "tasks" file.
+ */
+struct cgroup_pids {
+ /* The node in cgrp->pids_list */
+ struct list_head list;
+ /* The cgroup those pids belong to */
+ struct cgroup *cgrp;
+ /* The namepsace those pids belong to */
+ struct pid_namespace *ns;
+ /* Array of process ids in the cgroup */
+ pid_t *tasks_pids;
+ /* How many files are using the this tasks_pids array */
+ int use_count;
+ /* Length of the current tasks_pids array */
+ int length;
+};
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
-
/*
* seq_file methods for the "tasks" file. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
@@ -2111,45 +2254,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
int index = 0, pid = *pos;
int *iter;
down_read(&cgrp->pids_mutex);
if (pid) {
- int end = cgrp->pids_length;
+ int end = cp->length;
while (index < end) {
int mid = (index + end) / 2;
- if (cgrp->tasks_pids[mid] == pid) {
+ if (cp->tasks_pids[mid] == pid) {
index = mid;
break;
- } else if (cgrp->tasks_pids[mid] <= pid)
+ } else if (cp->tasks_pids[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
- if (index >= cgrp->pids_length)
+ if (index >= cp->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
- iter = cgrp->tasks_pids + index;
+ iter = cp->tasks_pids + index;
*pos = *iter;
return iter;
}
static void cgroup_tasks_stop(struct seq_file *s, void *v)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
+ struct cgroup *cgrp = cp->cgrp;
up_read(&cgrp->pids_mutex);
}
static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup *cgrp = s->private;
+ struct cgroup_pids *cp = s->private;
int *p = v;
- int *end = cgrp->tasks_pids + cgrp->pids_length;
+ int *end = cp->tasks_pids + cp->length;
/*
* Advance to the next pid in the array. If this goes off the
@@ -2176,26 +2321,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
.show = cgroup_tasks_show,
};
-static void release_cgroup_pid_array(struct cgroup *cgrp)
+static void release_cgroup_pid_array(struct cgroup_pids *cp)
{
+ struct cgroup *cgrp = cp->cgrp;
+
down_write(&cgrp->pids_mutex);
- BUG_ON(!cgrp->pids_use_count);
- if (!--cgrp->pids_use_count) {
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = NULL;
- cgrp->pids_length = 0;
+ BUG_ON(!cp->use_count);
+ if (!--cp->use_count) {
+ list_del(&cp->list);
+ put_pid_ns(cp->ns);
+ kfree(cp->tasks_pids);
+ kfree(cp);
}
up_write(&cgrp->pids_mutex);
}
static int cgroup_tasks_release(struct inode *inode, struct file *file)
{
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct seq_file *seq;
+ struct cgroup_pids *cp;
if (!(file->f_mode & FMODE_READ))
return 0;
- release_cgroup_pid_array(cgrp);
+ seq = file->private_data;
+ cp = seq->private;
+
+ release_cgroup_pid_array(cp);
return seq_release(inode, file);
}
@@ -2214,6 +2366,8 @@ static struct file_operations cgroup_tasks_operations = {
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct cgroup_pids *cp;
pid_t *pidarray;
int npids;
int retval;
@@ -2240,20 +2394,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
* array if necessary
*/
down_write(&cgrp->pids_mutex);
- kfree(cgrp->tasks_pids);
- cgrp->tasks_pids = pidarray;
- cgrp->pids_length = npids;
- cgrp->pids_use_count++;
+
+ list_for_each_entry(cp, &cgrp->pids_list, list) {
+ if (ns == cp->ns)
+ goto found;
+ }
+
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp) {
+ up_write(&cgrp->pids_mutex);
+ kfree(pidarray);
+ return -ENOMEM;
+ }
+ cp->cgrp = cgrp;
+ cp->ns = ns;
+ get_pid_ns(ns);
+ list_add(&cp->list, &cgrp->pids_list);
+found:
+ kfree(cp->tasks_pids);
+ cp->tasks_pids = pidarray;
+ cp->length = npids;
+ cp->use_count++;
up_write(&cgrp->pids_mutex);
file->f_op = &cgroup_tasks_operations;
retval = seq_open(file, &cgroup_tasks_seq_operations);
if (retval) {
- release_cgroup_pid_array(cgrp);
+ release_cgroup_pid_array(cp);
return retval;
}
- ((struct seq_file *)file->private_data)->private = cgrp;
+ ((struct seq_file *)file->private_data)->private = cp;
return 0;
}
@@ -2285,6 +2456,7 @@ static struct cftype files[] = {
.write_u64 = cgroup_tasks_write,
.release = cgroup_tasks_release,
.private = FILE_TASKLIST,
+ .mode = S_IRUGO | S_IWUSR,
},
{
@@ -2324,6 +2496,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
return err;
}
+ /* This cgroup is ready now */
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+ /*
+ * Update id->css pointer and make this css visible from
+ * CSS ID functions. This pointer will be dereferened
+ * from RCU-read-side without locks.
+ */
+ if (css->id)
+ rcu_assign_pointer(css->id->css, css);
+ }
return 0;
}
@@ -2335,6 +2518,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
css->cgroup = cgrp;
atomic_set(&css->refcnt, 1);
css->flags = 0;
+ css->id = NULL;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2349,7 +2533,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->root == root)
- mutex_lock_nested(&ss->hierarchy_mutex, i);
+ mutex_lock(&ss->hierarchy_mutex);
}
}
@@ -2373,7 +2557,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- int mode)
+ mode_t mode)
{
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
@@ -2410,6 +2594,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
+ if (ss->use_id)
+ if (alloc_css_id(ss, parent, cgrp))
+ goto err_destroy;
+ /* At error, ->destroy() callback has to free assigned ID. */
}
cgroup_lock_hierarchy(root);
@@ -2434,7 +2622,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err_remove:
+ cgroup_lock_hierarchy(root);
list_del(&cgrp->sibling);
+ cgroup_unlock_hierarchy(root);
root->number_of_cgroups--;
err_destroy:
@@ -2507,7 +2697,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
int refcnt;
- do {
+ while (1) {
/* We can only remove a CSS with a refcnt==1 */
refcnt = atomic_read(&css->refcnt);
if (refcnt > 1) {
@@ -2521,7 +2711,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
* css_tryget() to spin until we set the
* CSS_REMOVED bits or abort
*/
- } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+ if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+ break;
+ cpu_relax();
+ }
}
done:
for_each_subsys(cgrp->root, ss) {
@@ -2547,9 +2740,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
+ DEFINE_WAIT(wait);
+ int ret;
/* the vfs holds both inode->i_mutex already */
-
+again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
@@ -2562,20 +2757,51 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&cgroup_mutex);
/*
+ * In general, subsystem has no css->refcnt after pre_destroy(). But
+ * in racy cases, subsystem may have to get css->refcnt after
+ * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+ * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
+ * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+ * and subsystem's reference count handling. Please see css_get/put
+ * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
- cgroup_call_pre_destroy(cgrp);
+ ret = cgroup_call_pre_destroy(cgrp);
+ if (ret) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ return ret;
+ }
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
-
- if (atomic_read(&cgrp->count)
- || !list_empty(&cgrp->children)
- || !cgroup_clear_css_refs(cgrp)) {
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
+ prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+ if (!cgroup_clear_css_refs(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ /*
+ * Because someone may call cgroup_wakeup_rmdir_waiter() before
+ * prepare_to_wait(), we need to check this flag.
+ */
+ if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+ schedule();
+ finish_wait(&cgroup_rmdir_waitq, &wait);
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ if (signal_pending(current))
+ return -EINTR;
+ goto again;
+ }
+ /* NO css_tryget() can success after here. */
+ finish_wait(&cgroup_rmdir_waitq, &wait);
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2630,6 +2856,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
BUG_ON(!list_empty(&init_task.tasks));
mutex_init(&ss->hierarchy_mutex);
+ lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
}
@@ -2699,6 +2926,8 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
+ if (ss->use_id)
+ cgroup_subsys_init_idr(ss);
}
/* Add init_css_set to the hash table */
@@ -2991,20 +3220,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&cgroup_mutex);
return 0;
}
- task_lock(tsk);
- cg = tsk->cgroups;
- parent = task_cgroup(tsk, subsys->subsys_id);
/* Pin the hierarchy */
- if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+ if (!atomic_inc_not_zero(&root->sb->s_active)) {
/* We race with the final deactivate_super() */
mutex_unlock(&cgroup_mutex);
return 0;
}
/* Keep the cgroup alive */
+ task_lock(tsk);
+ parent = task_cgroup(tsk, subsys->subsys_id);
+ cg = tsk->cgroups;
get_css_set(cg);
task_unlock(tsk);
+
mutex_unlock(&cgroup_mutex);
/* Now do the VFS work to create a cgroup */
@@ -3043,7 +3273,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&inode->i_mutex);
put_css_set(cg);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
/* The cgroup is still accessible in the VFS, but
* we're not going to try to rmdir() it at this
* point. */
@@ -3069,23 +3299,24 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_lock(&cgroup_mutex);
put_css_set(cg);
mutex_unlock(&cgroup_mutex);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
return ret;
}
/**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
* @cgrp: the cgroup in question
+ * @task: the task in question
*
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
*
* If we are sending in dummytop, then presumably we are creating
* the top cgroup in the subsystem.
*
* Called only by the ns (nsproxy) cgroup.
*/
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
@@ -3095,7 +3326,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
return 1;
get_first_subsys(cgrp, NULL, &subsys_id);
- target = task_cgroup(current, subsys_id);
+ target = task_cgroup(task, subsys_id);
while (cgrp != target && cgrp!= cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
@@ -3128,10 +3359,12 @@ void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
- if ((atomic_dec_return(&css->refcnt) == 1) &&
- notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
+ if (atomic_dec_return(&css->refcnt) == 1) {
+ if (notify_on_release(cgrp)) {
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ check_for_release(cgrp);
+ }
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
}
@@ -3231,3 +3464,232 @@ static int __init cgroup_disable(char *str)
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functons for CSS ID.
+ */
+
+/*
+ *To get ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+ struct css_id *cssid = rcu_dereference(css->id);
+
+ if (cssid)
+ return cssid->id;
+ return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+ struct css_id *cssid = rcu_dereference(css->id);
+
+ if (cssid)
+ return cssid->depth;
+ return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+ const struct cgroup_subsys_state *root)
+{
+ struct css_id *child_id = rcu_dereference(child->id);
+ struct css_id *root_id = rcu_dereference(root->id);
+
+ if (!child_id || !root_id || (child_id->depth < root_id->depth))
+ return false;
+ return child_id->stack[root_id->depth] == root_id->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+ struct css_id *id;
+
+ id = container_of(head, struct css_id, rcu_head);
+ kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+ struct css_id *id = css->id;
+ /* When this is called before css_id initialization, id can be NULL */
+ if (!id)
+ return;
+
+ BUG_ON(!ss->use_id);
+
+ rcu_assign_pointer(id->css, NULL);
+ rcu_assign_pointer(css->id, NULL);
+ spin_lock(&ss->id_lock);
+ idr_remove(&ss->idr, id->id);
+ spin_unlock(&ss->id_lock);
+ call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (By cgroup_mutex() at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+ struct css_id *newid;
+ int myid, error, size;
+
+ BUG_ON(!ss->use_id);
+
+ size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+ newid = kzalloc(size, GFP_KERNEL);
+ if (!newid)
+ return ERR_PTR(-ENOMEM);
+ /* get id */
+ if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+ error = -ENOMEM;
+ goto err_out;
+ }
+ spin_lock(&ss->id_lock);
+ /* Don't use 0. allocates an ID of 1-65535 */
+ error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ spin_unlock(&ss->id_lock);
+
+ /* Returns error when there are no free spaces for new ID.*/
+ if (error) {
+ error = -ENOSPC;
+ goto err_out;
+ }
+ if (myid > CSS_ID_MAX)
+ goto remove_idr;
+
+ newid->id = myid;
+ newid->depth = depth;
+ return newid;
+remove_idr:
+ error = -ENOSPC;
+ spin_lock(&ss->id_lock);
+ idr_remove(&ss->idr, myid);
+ spin_unlock(&ss->id_lock);
+err_out:
+ kfree(newid);
+ return ERR_PTR(error);
+
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+ struct css_id *newid;
+ struct cgroup_subsys_state *rootcss;
+
+ spin_lock_init(&ss->id_lock);
+ idr_init(&ss->idr);
+
+ rootcss = init_css_set.subsys[ss->subsys_id];
+ newid = get_new_cssid(ss, 0);
+ if (IS_ERR(newid))
+ return PTR_ERR(newid);
+
+ newid->stack[0] = newid->id;
+ newid->css = rootcss;
+ rootcss->id = newid;
+ return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+ struct cgroup *child)
+{
+ int subsys_id, i, depth = 0;
+ struct cgroup_subsys_state *parent_css, *child_css;
+ struct css_id *child_id, *parent_id = NULL;
+
+ subsys_id = ss->subsys_id;
+ parent_css = parent->subsys[subsys_id];
+ child_css = child->subsys[subsys_id];
+ depth = css_depth(parent_css) + 1;
+ parent_id = parent_css->id;
+
+ child_id = get_new_cssid(ss, depth);
+ if (IS_ERR(child_id))
+ return PTR_ERR(child_id);
+
+ for (i = 0; i < depth; i++)
+ child_id->stack[i] = parent_id->stack[i];
+ child_id->stack[depth] = child_id->id;
+ /*
+ * child_id->css pointer will be set after this cgroup is available
+ * see cgroup_populate_dir()
+ */
+ rcu_assign_pointer(child_css->id, child_id);
+
+ return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns pointer to cgroup_subsys_state if there is valid one with id.
+ * NULL if not. Should be called under rcu_read_lock()
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+ struct css_id *cssid = NULL;
+
+ BUG_ON(!ss->use_id);
+ cssid = idr_find(&ss->idr, id);
+
+ if (unlikely(!cssid))
+ return NULL;
+
+ return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @root: pointer to css. search tree under this.
+ * @foundid: position of found object.
+ *
+ * Search next css under the specified hierarchy of rootid. Calling under
+ * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+ struct cgroup_subsys_state *root, int *foundid)
+{
+ struct cgroup_subsys_state *ret = NULL;
+ struct css_id *tmp;
+ int tmpid;
+ int rootid = css_id(root);
+ int depth = css_depth(root);
+
+ if (!rootid)
+ return NULL;
+
+ BUG_ON(!ss->use_id);
+ /* fill start point for scan */
+ tmpid = id;
+ while (1) {
+ /*
+ * scan next entry from bitmap(tree), tmpid is updated after
+ * idr_get_next().
+ */
+ spin_lock(&ss->id_lock);
+ tmp = idr_get_next(&ss->idr, &tmpid);
+ spin_unlock(&ss->id_lock);
+
+ if (!tmp)
+ break;
+ if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+ ret = rcu_dereference(tmp->css);
+ if (ret) {
+ *foundid = tmpid;
+ break;
+ }
+ }
+ /* continue to scan from next id */
+ tmpid = tmpid + 1;
+ }
+ return ret;
+}
+
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202..0c92d797baa 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
{
u64 count;
- cgroup_lock();
count = cgroup_task_count(cont);
- cgroup_unlock();
return count;
}
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460..f6c204f07ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+ struct compat_siginfo __user *uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb..8ce10043e4a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,14 +34,11 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
-} cpu_hotplug;
-
-void __init cpu_hotplug_init(void)
-{
- cpu_hotplug.active_writer = NULL;
- mutex_init(&cpu_hotplug.lock);
- cpu_hotplug.refcount = 0;
-}
+} cpu_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+ .refcount = 0,
+};
#ifdef CONFIG_HOTPLUG_CPU
@@ -281,7 +278,7 @@ int __ref cpu_down(unsigned int cpu)
goto out;
}
- cpu_clear(cpu, cpu_active_map);
+ set_cpu_active(cpu, false);
/*
* Make sure the all cpus did the reschedule and are not
@@ -296,7 +293,7 @@ int __ref cpu_down(unsigned int cpu)
err = _cpu_down(cpu, 0);
if (cpu_online(cpu))
- cpu_set(cpu, cpu_active_map);
+ set_cpu_active(cpu, true);
out:
cpu_maps_update_done();
@@ -333,7 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- cpu_set(cpu, cpu_active_map);
+ set_cpu_active(cpu, true);
/* Now call notifier in preparation. */
raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a85678865c5..7e75a41bd50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
#include <linux/cgroup.h>
/*
+ * Workqueue for cpuset related tasks.
+ *
+ * Using kevent workqueue may cause deadlock when memory_migrate
+ * is set. So we create a separate workqueue thread for cpuset.
+ */
+static struct workqueue_struct *cpuset_wq;
+
+/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -89,12 +97,6 @@ struct cpuset {
struct cpuset *parent; /* my parent */
- /*
- * Copy of global cpuset_mems_generation as of the most
- * recent time this cpuset changed its mems_allowed.
- */
- int mems_generation;
-
struct fmeter fmeter; /* memory_pressure filter */
/* partition number for rebuild_sched_domains() */
@@ -120,10 +122,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}
-struct cpuset_hotplug_scanner {
- struct cgroup_scanner scan;
- struct cgroup *to;
-};
/* bits in struct cpuset flags field */
typedef enum {
@@ -172,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
@@ -224,8 +201,9 @@ static struct cpuset top_cpuset = {
* If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -327,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Called with callback_mutex/cgroup_mutex held
*/
-
-void cpuset_update_task_memory_state(void)
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
+ struct task_struct *tsk)
{
- int my_cpusets_mem_gen;
- struct task_struct *tsk = current;
- struct cpuset *cs;
-
- rcu_read_lock();
- my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
- rcu_read_unlock();
-
- if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- mutex_lock(&callback_mutex);
- task_lock(tsk);
- cs = task_cs(tsk); /* Maybe changed when task not locked */
- guarantee_online_mems(cs, &tsk->mems_allowed);
- tsk->cpuset_mems_generation = cs->mems_generation;
- if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
- else
- tsk->flags &= ~PF_SPREAD_PAGE;
- if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
- else
- tsk->flags &= ~PF_SPREAD_SLAB;
- task_unlock(tsk);
- mutex_unlock(&callback_mutex);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- }
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
}
/*
@@ -513,6 +438,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
return 0;
}
+#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
@@ -807,6 +733,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
+#else /* !CONFIG_SMP */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+}
+
+static int generate_sched_domains(struct cpumask **domains,
+ struct sched_domain_attr **attributes)
+{
+ *domains = NULL;
+ return 1;
+}
+#endif /* CONFIG_SMP */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
@@ -831,7 +769,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
*/
static void async_rebuild_sched_domains(void)
{
- schedule_work(&rebuild_sched_domains_work);
+ queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
/*
@@ -990,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
* migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
*/
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1005,17 +935,64 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct task_struct *tsk = current;
- cpuset_update_task_memory_state();
-
- mutex_lock(&callback_mutex);
tsk->mems_allowed = *to;
- mutex_unlock(&callback_mutex);
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- mutex_lock(&callback_mutex);
guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
- mutex_unlock(&callback_mutex);
+}
+
+/*
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+{
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
+ mpol_rebind_task(tsk, newmems);
+ tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
+ * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
+ * memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+ struct cgroup_scanner *scan)
+{
+ struct mm_struct *mm;
+ struct cpuset *cs;
+ int migrate;
+ const nodemask_t *oldmem = scan->data;
+ nodemask_t newmems;
+
+ cs = cgroup_cs(scan->cg);
+ guarantee_online_mems(cs, &newmems);
+
+ task_lock(p);
+ cpuset_change_task_nodemask(p, &newmems);
+ task_unlock(p);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+ mmput(mm);
}
static void *cpuset_being_rebound;
@@ -1024,104 +1001,48 @@ static void *cpuset_being_rebound;
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
* @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
*/
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+ struct ptr_heap *heap)
{
- struct task_struct *p;
- struct mm_struct **mmarray;
- int i, n, ntasks;
- int migrate;
- int fudge;
- struct cgroup_iter it;
- int retval;
+ struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- fudge = 10; /* spare mmarray[] slots */
- fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
- retval = -ENOMEM;
-
- /*
- * Allocate mmarray[] to hold mm reference for each task
- * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
- * tasklist_lock. We could use GFP_ATOMIC, but with a
- * few more lines of code, we can retry until we get a big
- * enough mmarray[] w/o using GFP_ATOMIC.
- */
- while (1) {
- ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
- ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
- if (!mmarray)
- goto done;
- read_lock(&tasklist_lock); /* block fork */
- if (cgroup_task_count(cs->css.cgroup) <= ntasks)
- break; /* got enough */
- read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
- }
-
- n = 0;
-
- /* Load up mmarray[] with mm reference for each task in cpuset. */
- cgroup_iter_start(cs->css.cgroup, &it);
- while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
- struct mm_struct *mm;
-
- if (n >= ntasks) {
- printk(KERN_WARNING
- "Cpuset mempolicy rebind incomplete.\n");
- break;
- }
- mm = get_task_mm(p);
- if (!mm)
- continue;
- mmarray[n++] = mm;
- }
- cgroup_iter_end(cs->css.cgroup, &it);
- read_unlock(&tasklist_lock);
+ scan.cg = cs->css.cgroup;
+ scan.test_task = NULL;
+ scan.process_task = cpuset_change_nodemask;
+ scan.heap = heap;
+ scan.data = (nodemask_t *)oldmem;
/*
- * Now that we've dropped the tasklist spinlock, we can
- * rebind the vma mempolicies of each mm in mmarray[] to their
- * new cpuset, and release that mm. The mpol_rebind_mm()
- * call takes mmap_sem, which we couldn't take while holding
- * tasklist_lock. Forks can happen again now - the mpol_dup()
- * cpuset_being_rebound check will catch such forks, and rebind
- * their vma mempolicies too. Because we still hold the global
- * cgroup_mutex, we know that no other rebind effort will
- * be contending for the global variable cpuset_being_rebound.
+ * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cgroup_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- migrate = is_memory_migrate(cs);
- for (i = 0; i < n; i++) {
- struct mm_struct *mm = mmarray[i];
-
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
- mmput(mm);
- }
+ cgroup_scan_tasks(&scan);
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
- kfree(mmarray);
cpuset_being_rebound = NULL;
- retval = 0;
-done:
- return retval;
}
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
*
* Call with cgroup_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1133,6 +1054,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
{
nodemask_t oldmem;
int retval;
+ struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1167,12 +1089,17 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (retval < 0)
+ goto done;
+
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
- cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
- retval = update_tasks_nodemask(cs, &oldmem);
+ update_tasks_nodemask(cs, &oldmem, &heap);
+
+ heap_free(&heap);
done:
return retval;
}
@@ -1184,8 +1111,10 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
+#ifdef CONFIG_SMP
if (val < -1 || val >= SD_LV_MAX)
return -EINVAL;
+#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
@@ -1198,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
}
/*
+ * cpuset_change_flag - make a task's spread flags the same as its cpuset's
+ * @tsk: task to be updated
+ * @scan: struct cgroup_scanner containing the cgroup of the task
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ *
+ * We don't need to re-check for the cgroup/cpuset membership, since we're
+ * holding cgroup_lock() at this point.
+ */
+static void cpuset_change_flag(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
+}
+
+/*
+ * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
+ */
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+ struct cgroup_scanner scan;
+
+ scan.cg = cs->css.cgroup;
+ scan.test_task = NULL;
+ scan.process_task = cpuset_change_flag;
+ scan.heap = heap;
+ cgroup_scan_tasks(&scan);
+}
+
+/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
@@ -1210,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
- int err;
int balance_flag_changed;
+ int spread_flag_changed;
+ struct ptr_heap heap;
+ int err;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
@@ -1226,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
+ err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (err < 0)
+ goto out;
+
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
+ spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
mutex_lock(&callback_mutex);
cs->flags = trialcs->flags;
mutex_unlock(&callback_mutex);
@@ -1236,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
async_rebuild_sched_domains();
+ if (spread_flag_changed)
+ update_tasks_flags(cs, &heap);
+ heap_free(&heap);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1347,19 +1328,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
{
struct cpuset *cs = cgroup_cs(cont);
- int ret = 0;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
- if (tsk->flags & PF_THREAD_BOUND) {
- mutex_lock(&callback_mutex);
- if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
- ret = -EINVAL;
- mutex_unlock(&callback_mutex);
- }
+ /*
+ * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
+ * cannot change their cpu affinity and isolating such threads by their
+ * set of allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for success of
+ * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+ * be changed.
+ */
+ if (tsk->flags & PF_THREAD_BOUND)
+ return -EINVAL;
- return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+ return security_task_setscheduler(tsk, 0, NULL);
}
static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1374,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
+ to = node_possible_map;
} else {
- mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, cpus_attach);
- mutex_unlock(&callback_mutex);
+ guarantee_online_mems(cs, &to);
}
err = set_cpus_allowed_ptr(tsk, cpus_attach);
if (err)
return;
+ task_lock(tsk);
+ cpuset_change_task_nodemask(tsk, &to);
+ task_unlock(tsk);
+ cpuset_update_task_spread_flag(cs, tsk);
+
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);
@@ -1444,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
@@ -1698,6 +1685,7 @@ static struct cftype files[] = {
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE,
+ .mode = S_IRUGO,
},
{
@@ -1787,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
struct cpuset *parent;
if (!cont->parent) {
- /* This is early initialization for the top cgroup */
- top_cpuset.mems_generation = cpuset_mems_generation++;
return &top_cpuset.css;
}
parent = cgroup_cs(cont->parent);
@@ -1800,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cpuset_update_task_memory_state();
cs->flags = 0;
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1809,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -1828,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- cpuset_update_task_memory_state();
-
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1850,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
.early_init = 1,
};
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
- top_cpuset.mems_generation = cpuset_mems_generation++;
- return 0;
-}
-
-
/**
* cpuset_init - initialize cpusets at system boot
*
@@ -1875,11 +1842,13 @@ int __init cpuset_init(void)
{
int err = 0;
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
+
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
fmeter_init(&top_cpuset.fmeter);
- top_cpuset.mems_generation = cpuset_mems_generation++;
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
@@ -1905,10 +1874,9 @@ int __init cpuset_init(void)
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
- struct cpuset_hotplug_scanner *chsp;
+ struct cgroup *new_cgroup = scan->data;
- chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
- cgroup_attach_task(chsp->to, tsk);
+ cgroup_attach_task(new_cgroup, tsk);
}
/**
@@ -1924,15 +1892,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
- struct cpuset_hotplug_scanner scan;
+ struct cgroup_scanner scan;
- scan.scan.cg = from->css.cgroup;
- scan.scan.test_task = NULL; /* select all tasks in cgroup */
- scan.scan.process_task = cpuset_do_move_task;
- scan.scan.heap = NULL;
- scan.to = to->css.cgroup;
+ scan.cg = from->css.cgroup;
+ scan.test_task = NULL; /* select all tasks in cgroup */
+ scan.process_task = cpuset_do_move_task;
+ scan.heap = NULL;
+ scan.data = to->css.cgroup;
- if (cgroup_scan_tasks(&scan.scan))
+ if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
@@ -2025,7 +1993,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
@@ -2061,7 +2029,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
}
cgroup_lock();
+ mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ mutex_unlock(&callback_mutex);
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2084,11 +2054,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- break;
case MEM_OFFLINE:
+ mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
+ mutex_unlock(&callback_mutex);
+ if (action == MEM_OFFLINE)
+ scan_for_empty_cpusets(&top_cpuset);
break;
default:
break;
@@ -2111,6 +2082,9 @@ void __init cpuset_init_smp(void)
hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+
+ cpuset_wq = create_singlethread_workqueue("cpuset");
+ BUG_ON(!cpuset_wq);
}
/**
@@ -2195,26 +2169,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
}
/**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate. If
- * __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
* Otherwise, no.
*
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
*
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2253,20 +2225,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
- * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
*/
-
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2295,15 +2264,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
}
/*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
- * If we're in interrupt, yes, we can always allocate.
- * If __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. If the task has been
- * OOM killed and has access to memory reserves as specified by the
- * TIF_MEMDIE flag, yes. Otherwise, no.
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2311,20 +2280,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
* any node on the zonelist except the first. By the time any such
* calls get to this routine, we should just shut up and say 'yes'.
*
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
+ * this variant requires that the node be in the current task's
* mems_allowed or that we're in interrupt. It does not scan up the
* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
* It never sleeps.
*/
-
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
- int node; /* node that zone z is on */
-
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d70..006fcab009d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
#include <linux/cn_proc.h>
#include "cred-internals.h"
+#if 0
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+#define kdebug(FMT, ...) \
+ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
static struct kmem_cache *cred_jar;
/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ .subscribers = ATOMIC_INIT(2),
+ .magic = CRED_MAGIC,
+#endif
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_INIT_INH_SET,
.cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
#endif
};
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ return atomic_read(&cred->subscribers);
+#else
+ return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ struct cred *cred = (struct cred *) _cred;
+
+ atomic_add(n, &cred->subscribers);
+#endif
+}
+
/*
* Dispose of the shared task group credentials
*/
@@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
+ kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ if (cred->magic != CRED_MAGIC_DEAD ||
+ atomic_read(&cred->usage) != 0 ||
+ read_cred_subscribers(cred) != 0)
+ panic("CRED: put_cred_rcu() sees %p with"
+ " mag %x, put %p, usage %d, subscr %d\n",
+ cred, cred->magic, cred->put_addr,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
+#endif
security_cred_free(cred);
key_put(cred->thread_keyring);
@@ -106,12 +160,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
+ kdebug("__put_cred(%p{%d,%d})", cred,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+
BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(cred) != 0);
+ cred->magic = CRED_MAGIC_DEAD;
+ cred->put_addr = __builtin_return_address(0);
+#endif
+ BUG_ON(cred == current->cred);
+ BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+ struct cred *cred;
+
+ kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ cred = (struct cred *) tsk->real_cred;
+ tsk->real_cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->cred;
+ tsk->cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->replacement_session_keyring;
+ if (cred) {
+ tsk->replacement_session_keyring = NULL;
+ validate_creds(cred);
+ put_cred(cred);
+ }
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+ struct cred *new;
+
+ new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+#ifdef CONFIG_KEYS
+ new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
+ if (!new->tgcred) {
+ kfree(new);
+ return NULL;
+ }
+ atomic_set(&new->tgcred->usage, 1);
+#endif
+
+ atomic_set(&new->usage, 1);
+
+ if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ goto error;
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
+ return new;
+
+error:
+ abort_creds(new);
+ return NULL;
+}
+
/**
* prepare_creds - Prepare a new set of credentials for modification
*
@@ -132,16 +264,19 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+ validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
+ kdebug("prepare_creds() alloc %p", new);
+
old = task->cred;
memcpy(new, old, sizeof(struct cred));
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
@@ -157,6 +292,7 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
+ validate_creds(new);
return new;
error:
@@ -167,7 +303,7 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold current->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
@@ -229,9 +365,12 @@ struct cred *prepare_usermodehelper_creds(void)
if (!new)
return NULL;
+ kdebug("prepare_usermodehelper_creds() alloc %p", new);
+
memcpy(new, &init_cred, sizeof(struct cred));
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
@@ -250,6 +389,7 @@ struct cred *prepare_usermodehelper_creds(void)
#endif
if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
goto error;
+ validate_creds(new);
BUG_ON(atomic_read(&new->usage) != 1);
return new;
@@ -276,7 +416,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
- mutex_init(&p->cred_exec_mutex);
+ mutex_init(&p->cred_guard_mutex);
if (
#ifdef CONFIG_KEYS
@@ -286,6 +426,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
+ alter_cred_subscribers(p->cred, 2);
+ kdebug("share_creds(%p{%d,%d})",
+ p->cred, atomic_read(&p->cred->usage),
+ read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes);
return 0;
}
@@ -331,6 +475,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ alter_cred_subscribers(new, 2);
+ validate_creds(new);
return 0;
error_put:
@@ -355,13 +501,20 @@ error_put:
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
- const struct cred *old;
+ const struct cred *old = task->real_cred;
- BUG_ON(task->cred != task->real_cred);
- BUG_ON(atomic_read(&task->real_cred->usage) < 2);
+ kdebug("commit_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(old) < 2);
+ validate_creds(old);
+ validate_creds(new);
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
- old = task->real_cred;
security_commit_creds(new, old);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +543,14 @@ int commit_creds(struct cred *new)
* cheaply with the new uid cache, so if it matters
* we should be checking for it. -DaveM
*/
+ alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
atomic_dec(&old->user->processes);
+ alter_cred_subscribers(old, -2);
sched_switch_user(task);
@@ -428,6 +583,13 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
+ kdebug("abort_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(new) != 0);
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
@@ -444,7 +606,20 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- rcu_assign_pointer(current->cred, get_cred(new));
+ kdebug("override_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ validate_creds(old);
+ validate_creds(new);
+ get_cred(new);
+ alter_cred_subscribers(new, 1);
+ rcu_assign_pointer(current->cred, new);
+ alter_cred_subscribers(old, -1);
+
+ kdebug("override_creds() = %p{%d,%d}", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -460,7 +635,15 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
+ kdebug("revert_creds(%p{%d,%d})", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
+
+ validate_creds(old);
+ validate_creds(override);
+ alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
+ alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -502,11 +685,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
+ kdebug("prepare_kernel_cred() alloc %p", new);
+
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
+ validate_creds(old);
+
*new = *old;
get_uid(new->user);
get_group_info(new->group_info);
@@ -526,7 +713,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
goto error;
atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
put_cred(old);
+ validate_creds(new);
return new;
error:
@@ -589,3 +778,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+ const struct task_struct *tsk)
+{
+ printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ label, cred,
+ cred == &init_cred ? "[init]" : "",
+ cred == tsk->real_cred ? "[real]" : "",
+ cred == tsk->cred ? "[eff]" : "");
+ printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ cred->magic, cred->put_addr);
+ printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+ printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ cred->uid, cred->euid, cred->suid, cred->fsuid);
+ printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ cred->gid, cred->egid, cred->sgid, cred->fsgid);
+#ifdef CONFIG_SECURITY
+ printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ if ((unsigned long) cred->security >= PAGE_SIZE &&
+ (((unsigned long) cred->security & 0xffffff00) !=
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+ printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ ((u32*)cred->security)[0],
+ ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+ printk(KERN_ERR "CRED: Invalid credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ dump_invalid_creds(cred, "Specified", current);
+ BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+ const char *file, unsigned line)
+{
+ if (tsk->cred == tsk->real_cred) {
+ if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ } else {
+ if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+ read_cred_subscribers(tsk->cred) < 1 ||
+ creds_are_invalid(tsk->real_cred) ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ }
+ return;
+
+invalid_creds:
+ printk(KERN_ERR "CRED: Invalid process credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+ dump_invalid_creds(tsk->real_cred, "Real", tsk);
+ if (tsk->cred != tsk->real_cred)
+ dump_invalid_creds(tsk->cred, "Effective", tsk);
+ else
+ printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+ kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+ tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ __validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b7..962a3b574f2 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
* @size: size of requested memory area
* @dma_handle: This will be filled with the correct dma handle
* @ret: This pointer will be filled with the virtual address
- * to allocated area.
+ * to allocated area.
*
* This function should be only called from per-arch dma_alloc_coherent()
* to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
mem = dev->dma_mem;
if (!mem)
return 0;
- if (unlikely(size > mem->size))
- return 0;
+
+ *ret = NULL;
+
+ if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ goto err;
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
- if (pageno >= 0) {
- /*
- * Memory was found in the per-device arena.
- */
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- *ret = mem->virt_base + (pageno << PAGE_SHIFT);
- memset(*ret, 0, size);
- } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
- /*
- * The per-device arena is exhausted and we are not
- * permitted to fall back to generic memory.
- */
- *ret = NULL;
- } else {
- /*
- * The per-device arena is exhausted and we are
- * permitted to fall back to generic memory.
- */
- return 0;
- }
+ if (unlikely(pageno < 0))
+ goto err;
+
+ /*
+ * Memory was found in the per-device area.
+ */
+ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ *ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ memset(*ret, 0, size);
+
return 1;
+
+err:
+ /*
+ * In the case where the allocation can not be satisfied from the
+ * per-device area, try to fall back to generic memory if the
+ * constraints allow it.
+ */
+ return mem->flags & DMA_MEMORY_EXCLUSIVE;
}
EXPORT_SYMBOL(dma_alloc_from_coherent);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841c295..c35452cadde 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
+#include <linux/fs_struct.h>
static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
return 0;
}
- if (atomic_read(&current->fs->count) != 1) {
- struct fs_struct *fsp, *ofsp;
-
- fsp = copy_fs_struct(current->fs);
- if (fsp == NULL) {
- module_put(ep->module);
- return -ENOMEM;
- }
-
- task_lock(current);
- ofsp = current->fs;
- current->fs = fsp;
- task_unlock(current);
-
- put_fs_struct(ofsp);
- }
-
- /*
- * At that point we are guaranteed to be the sole owner of
- * current->fs.
- */
-
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f187..ae5d8660ddf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
-#include <linux/mnt_namespace.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/security.h>
@@ -46,8 +45,10 @@
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
+#include <linux/fs_struct.h>
#include <linux/init_task.h>
-#include <trace/sched.h>
+#include <linux/perf_counter.h>
+#include <trace/events/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -55,17 +56,8 @@
#include <asm/mmu_context.h>
#include "cred-internals.h"
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
static void exit_mm(struct task_struct * tsk);
-static inline int task_detached(struct task_struct *p)
-{
- return p->exit_signal == -1;
-}
-
static void __unhash_process(struct task_struct *p)
{
nr_threads--;
@@ -118,6 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
+ sig->utime = cputime_add(sig->utime, task_utime(tsk));
+ sig->stime = cputime_add(sig->stime, task_stime(tsk));
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
@@ -126,6 +120,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
@@ -159,6 +154,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+#ifdef CONFIG_PERF_COUNTERS
+ WARN_ON_ONCE(tsk->perf_counter_ctxp);
+#endif
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
@@ -175,6 +173,7 @@ repeat:
atomic_dec(&__task_cred(p)->user->processes);
proc_flush_task(p);
+
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
@@ -359,16 +358,12 @@ static void reparent_to_kthreadd(void)
void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
- pid_t nr = pid_nr(pid);
- if (task_session(curr) != pid) {
+ if (task_session(curr) != pid)
change_pid(curr, PIDTYPE_SID, pid);
- set_task_session(curr, nr);
- }
- if (task_pgrp(curr) != pid) {
+
+ if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
- set_task_pgrp(curr, nr);
- }
}
static void set_special_pids(struct pid *pid)
@@ -379,9 +374,8 @@ static void set_special_pids(struct pid *pid)
}
/*
- * Let kernel threads use this to say that they
- * allow a certain signal (since daemonize() will
- * have disabled all of them by default).
+ * Let kernel threads use this to say that they allow a certain signal.
+ * Must not be used if kthread was cloned with CLONE_SIGHAND.
*/
int allow_signal(int sig)
{
@@ -389,14 +383,14 @@ int allow_signal(int sig)
return -EINVAL;
spin_lock_irq(&current->sighand->siglock);
+ /* This is only needed for daemonize()'ed kthreads */
sigdelset(&current->blocked, sig);
- if (!current->mm) {
- /* Kernel threads handle their own signals.
- Let the signal code know it'll be handled, so
- that they don't get converted to SIGKILL or
- just silently dropped */
- current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
- }
+ /*
+ * Kernel threads handle their own signals. Let the signal code
+ * know it'll be handled, so that they don't get converted to
+ * SIGKILL or just silently dropped.
+ */
+ current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
return 0;
@@ -426,7 +420,6 @@ EXPORT_SYMBOL(disallow_signal);
void daemonize(const char *name, ...)
{
va_list args;
- struct fs_struct *fs;
sigset_t blocked;
va_start(args, name);
@@ -459,11 +452,7 @@ void daemonize(const char *name, ...)
/* Become as one with the init task */
- exit_fs(current); /* current->fs->count--; */
- fs = init_task.fs;
- current->fs = fs;
- atomic_inc(&fs->count);
-
+ daemonize_fs_struct();
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -562,30 +551,6 @@ void exit_files(struct task_struct *tsk)
}
}
-void put_fs_struct(struct fs_struct *fs)
-{
- /* No need to hold fs->lock if we are killing it */
- if (atomic_dec_and_test(&fs->count)) {
- path_put(&fs->root);
- path_put(&fs->pwd);
- kmem_cache_free(fs_cachep, fs);
- }
-}
-
-void exit_fs(struct task_struct *tsk)
-{
- struct fs_struct * fs = tsk->fs;
-
- if (fs) {
- task_lock(tsk);
- tsk->fs = NULL;
- task_unlock(tsk);
- put_fs_struct(fs);
- }
-}
-
-EXPORT_SYMBOL_GPL(exit_fs);
-
#ifdef CONFIG_MM_OWNER
/*
* Task p is exiting and it owned mm, lets find a new owner for it
@@ -624,7 +589,7 @@ retry:
/*
* Search in the siblings
*/
- list_for_each_entry(c, &p->parent->children, sibling) {
+ list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
@@ -729,119 +694,6 @@ static void exit_mm(struct task_struct * tsk)
}
/*
- * Return nonzero if @parent's children should reap themselves.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
- */
-static int ignoring_children(struct task_struct *parent)
-{
- int ret;
- struct sighand_struct *psig = parent->sighand;
- unsigned long flags;
- spin_lock_irqsave(&psig->siglock, flags);
- ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
- (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
- spin_unlock_irqrestore(&psig->siglock, flags);
- return ret;
-}
-
-/*
- * Detach all tasks we were using ptrace on.
- * Any that need to be release_task'd are put on the @dead list.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
-{
- struct task_struct *p, *n;
- int ign = -1;
-
- list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
- __ptrace_unlink(p);
-
- if (p->exit_state != EXIT_ZOMBIE)
- continue;
-
- /*
- * If it's a zombie, our attachedness prevented normal
- * parent notification or self-reaping. Do notification
- * now if it would have happened earlier. If it should
- * reap itself, add it to the @dead list. We can't call
- * release_task() here because we already hold tasklist_lock.
- *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
- */
- if (!task_detached(p) && thread_group_empty(p)) {
- if (!same_thread_group(p->real_parent, parent))
- do_notify_parent(p, p->exit_signal);
- else {
- if (ign < 0)
- ign = ignoring_children(parent);
- if (ign)
- p->exit_signal = -1;
- }
- }
-
- if (task_detached(p)) {
- /*
- * Mark it as in the process of being reaped.
- */
- p->exit_state = EXIT_DEAD;
- list_add(&p->ptrace_entry, dead);
- }
- }
-}
-
-/*
- * Finish up exit-time ptrace cleanup.
- *
- * Called without locks.
- */
-static void ptrace_exit_finish(struct task_struct *parent,
- struct list_head *dead)
-{
- struct task_struct *p, *n;
-
- BUG_ON(!list_empty(&parent->ptraced));
-
- list_for_each_entry_safe(p, n, dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
- }
-}
-
-static void reparent_thread(struct task_struct *p, struct task_struct *father)
-{
- if (p->pdeath_signal)
- /* We already hold the tasklist_lock here. */
- group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- /* If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
- return;
-
- /* We don't want people slaying init. */
- if (!task_detached(p))
- p->exit_signal = SIGCHLD;
-
- /* If we'd notified the old parent about this child's death,
- * also notify the new parent.
- */
- if (!ptrace_reparented(p) &&
- p->exit_state == EXIT_ZOMBIE &&
- !task_detached(p) && thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
-
- kill_orphaned_pgrp(p, father);
-}
-
-/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
* group, and if no such member exists, give it to
@@ -880,31 +732,68 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
return pid_ns->child_reaper;
}
+/*
+* Any that need to be release_task'd are put on the @dead list.
+ */
+static void reparent_thread(struct task_struct *father, struct task_struct *p,
+ struct list_head *dead)
+{
+ if (p->pdeath_signal)
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);
+
+ if (task_detached(p))
+ return;
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (same_thread_group(p->real_parent, father))
+ return;
+
+ /* We don't want people slaying init. */
+ p->exit_signal = SIGCHLD;
+
+ /* If it has exited notify the new parent about this child's death. */
+ if (!task_ptrace(p) &&
+ p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+ do_notify_parent(p, p->exit_signal);
+ if (task_detached(p)) {
+ p->exit_state = EXIT_DEAD;
+ list_move_tail(&p->sibling, dead);
+ }
+ }
+
+ kill_orphaned_pgrp(p, father);
+}
+
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper;
- LIST_HEAD(ptrace_dead);
+ LIST_HEAD(dead_children);
+
+ exit_ptrace(father);
write_lock_irq(&tasklist_lock);
reaper = find_new_reaper(father);
- /*
- * First clean up ptrace if we were using it.
- */
- ptrace_exit(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
if (p->parent == father) {
- BUG_ON(p->ptrace);
+ BUG_ON(task_ptrace(p));
p->parent = p->real_parent;
}
- reparent_thread(p, father);
+ reparent_thread(father, p, &dead_children);
}
-
write_unlock_irq(&tasklist_lock);
+
BUG_ON(!list_empty(&father->children));
- ptrace_exit_finish(father, &ptrace_dead);
+ list_for_each_entry_safe(p, n, &dead_children, sibling) {
+ list_del_init(&p->sibling);
+ release_task(p);
+ }
}
/*
@@ -947,8 +836,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
*/
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id) &&
- !capable(CAP_KILL))
+ tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -977,12 +865,9 @@ static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
- unsigned long *n = end_of_stack(current);
unsigned long free;
- while (*n == 0)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(current);
+ free = stack_not_used(current);
if (free >= lowest_to_date)
return;
@@ -1016,6 +901,8 @@ NORET_TYPE void do_exit(long code)
tracehook_report_exit(&code);
+ validate_creds_for_do_exit(tsk);
+
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
@@ -1037,6 +924,8 @@ NORET_TYPE void do_exit(long code)
schedule();
}
+ exit_irq_thread();
+
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
@@ -1087,16 +976,19 @@ NORET_TYPE void do_exit(long code)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_counter_exit_task(tsk);
+
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
#ifdef CONFIG_FUTEX
- /*
- * This must happen late, after the PID is not
- * hashed anymore:
- */
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
@@ -1119,7 +1011,10 @@ NORET_TYPE void do_exit(long code)
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
+ validate_creds_for_do_exit(tsk);
+
preempt_disable();
+ exit_rcu();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
schedule();
@@ -1189,6 +1084,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct siginfo __user *wo_info;
+ int __user *wo_stat;
+ struct rusage __user *wo_rusage;
+
+ int notask_error;
+};
+
static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
struct pid *pid = NULL;
@@ -1199,13 +1106,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
return pid;
}
-static int eligible_child(enum pid_type type, struct pid *pid, int options,
- struct task_struct *p)
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
{
int err;
- if (type < PIDTYPE_MAX) {
- if (task_pid_type(p, type) != pid)
+ if (wo->wo_type < PIDTYPE_MAX) {
+ if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
return 0;
}
@@ -1214,8 +1120,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
* set; otherwise, wait for non-clone children *only*. (Note:
* A "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
- && !(options & __WALL))
+ if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+ && !(wo->wo_flags & __WALL))
return 0;
err = security_task_wait(p);
@@ -1225,14 +1131,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
return 1;
}
-static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
- int why, int status,
- struct siginfo __user *infop,
- struct rusage __user *rusagep)
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+ pid_t pid, uid_t uid, int why, int status)
{
- int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+ struct siginfo __user *infop;
+ int retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
put_task_struct(p);
+ infop = wo->wo_info;
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval)
@@ -1256,19 +1163,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(struct task_struct *p, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
uid_t uid = __task_cred(p)->uid;
+ struct siginfo __user *infop;
- if (!likely(options & WEXITED))
+ if (!likely(wo->wo_flags & WEXITED))
return 0;
- if (unlikely(options & WNOWAIT)) {
+ if (unlikely(wo->wo_flags & WNOWAIT)) {
int exit_code = p->exit_code;
int why, status;
@@ -1281,8 +1187,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
status = exit_code & 0x7f;
}
- return wait_noreap_copyout(p, pid, uid, why,
- status, infop, ru);
+ return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
/*
@@ -1296,11 +1201,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
}
traced = ptrace_reparented(p);
-
- if (likely(!traced)) {
+ /*
+ * It can be ptraced but not reparented, check
+ * !task_detached() to filter out sub-threads.
+ */
+ if (likely(!traced) && likely(!task_detached(p))) {
struct signal_struct *psig;
struct signal_struct *sig;
- struct task_cputime cputime;
/*
* The resource counters for the group leader are in its
@@ -1313,26 +1220,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
* p->signal fields, because they are only touched by
* __exit_signal, which runs with tasklist_lock
* write-locked anyway, and so is excluded here. We do
- * need to protect the access to p->parent->signal fields,
+ * need to protect the access to parent->signal fields,
* as other threads in the parent group can be right
* here reaping other children at the same time.
- *
- * We use thread_group_cputime() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
*/
- thread_group_cputime(p, &cputime);
- spin_lock_irq(&p->parent->sighand->siglock);
- psig = p->parent->signal;
+ spin_lock_irq(&p->real_parent->sighand->siglock);
+ psig = p->real_parent->signal;
sig = p->signal;
psig->cutime =
cputime_add(psig->cutime,
- cputime_add(cputime.utime,
- sig->cutime));
+ cputime_add(p->utime,
+ cputime_add(sig->utime,
+ sig->cutime)));
psig->cstime =
cputime_add(psig->cstime,
- cputime_add(cputime.stime,
- sig->cstime));
+ cputime_add(p->stime,
+ cputime_add(sig->stime,
+ sig->cstime)));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
@@ -1354,7 +1258,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
sig->oublock + sig->coublock;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- spin_unlock_irq(&p->parent->sighand->siglock);
+ spin_unlock_irq(&p->real_parent->sighand->siglock);
}
/*
@@ -1363,11 +1267,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
*/
read_unlock(&tasklist_lock);
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && stat_addr)
- retval = put_user(status, stat_addr);
+ if (!retval && wo->wo_stat)
+ retval = put_user(status, wo->wo_stat);
+
+ infop = wo->wo_info;
if (!retval && infop)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval && infop)
@@ -1417,42 +1324,51 @@ static int wait_task_zombie(struct task_struct *p, int options,
return retval;
}
+static int *task_stopped_code(struct task_struct *p, bool ptrace)
+{
+ if (ptrace) {
+ if (task_is_stopped_or_traced(p))
+ return &p->exit_code;
+ } else {
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return &p->signal->group_exit_code;
+ }
+ return NULL;
+}
+
/*
* Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(int ptrace, struct task_struct *p,
- int options, struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_stopped(struct wait_opts *wo,
+ int ptrace, struct task_struct *p)
{
- int retval, exit_code, why;
+ struct siginfo __user *infop;
+ int retval, exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
- if (!(options & WUNTRACED))
+ /*
+ * Traditionally we see ptrace'd stopped tasks regardless of options.
+ */
+ if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
- if (unlikely(!task_is_stopped_or_traced(p)))
- goto unlock_sig;
-
- if (!ptrace && p->signal->group_stop_count > 0)
- /*
- * A group stop is in progress and this is the group leader.
- * We won't report until all threads have stopped.
- */
+ p_code = task_stopped_code(p, ptrace);
+ if (unlikely(!p_code))
goto unlock_sig;
- exit_code = p->exit_code;
+ exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
- if (!unlikely(options & WNOWAIT))
- p->exit_code = 0;
+ if (!unlikely(wo->wo_flags & WNOWAIT))
+ *p_code = 0;
/* don't need the RCU readlock here as we're holding a spinlock */
uid = __task_cred(p)->uid;
@@ -1473,14 +1389,15 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
- if (unlikely(options & WNOWAIT))
- return wait_noreap_copyout(p, pid, uid,
- why, exit_code,
- infop, ru);
+ if (unlikely(wo->wo_flags & WNOWAIT))
+ return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
- if (!retval && stat_addr)
- retval = put_user((exit_code << 8) | 0x7f, stat_addr);
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (!retval && wo->wo_stat)
+ retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+
+ infop = wo->wo_info;
if (!retval && infop)
retval = put_user(SIGCHLD, &infop->si_signo);
if (!retval && infop)
@@ -1507,15 +1424,13 @@ unlock_sig:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(struct task_struct *p, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
int retval;
pid_t pid;
uid_t uid;
- if (!unlikely(options & WCONTINUED))
+ if (!unlikely(wo->wo_flags & WCONTINUED))
return 0;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1527,7 +1442,7 @@ static int wait_task_continued(struct task_struct *p, int options,
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
- if (!unlikely(options & WNOWAIT))
+ if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
uid = __task_cred(p)->uid;
spin_unlock_irq(&p->sighand->siglock);
@@ -1536,17 +1451,17 @@ static int wait_task_continued(struct task_struct *p, int options,
get_task_struct(p);
read_unlock(&tasklist_lock);
- if (!infop) {
- retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
+ if (!wo->wo_info) {
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
put_task_struct(p);
- if (!retval && stat_addr)
- retval = put_user(0xffff, stat_addr);
+ if (!retval && wo->wo_stat)
+ retval = put_user(0xffff, wo->wo_stat);
if (!retval)
retval = pid;
} else {
- retval = wait_noreap_copyout(p, pid, uid,
- CLD_CONTINUED, SIGCONT,
- infop, ru);
+ retval = wait_noreap_copyout(wo, p, pid, uid,
+ CLD_CONTINUED, SIGCONT);
BUG_ON(retval == 0);
}
@@ -1556,19 +1471,16 @@ static int wait_task_continued(struct task_struct *p, int options,
/*
* Consider @p for a wait by @parent.
*
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child,
+ * then ->notask_error is 0 if @p is an eligible child,
* or another error from security_task_wait(), or still -ECHILD.
*/
-static int wait_consider_task(struct task_struct *parent, int ptrace,
- struct task_struct *p, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
+ int ptrace, struct task_struct *p)
{
- int ret = eligible_child(type, pid, options, p);
+ int ret = eligible_child(wo, p);
if (!ret)
return ret;
@@ -1580,16 +1492,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
* to look for security policy problems, rather
* than for mysterious wait bugs.
*/
- if (*notask_error)
- *notask_error = ret;
+ if (wo->notask_error)
+ wo->notask_error = ret;
+ return 0;
}
- if (likely(!ptrace) && unlikely(p->ptrace)) {
+ if (likely(!ptrace) && unlikely(task_ptrace(p))) {
/*
* This child is hidden by ptrace.
* We aren't allowed to see it now, but eventually we will.
*/
- *notask_error = 0;
+ wo->notask_error = 0;
return 0;
}
@@ -1600,34 +1513,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
* We don't reap group leaders with subthreads.
*/
if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(p, options, infop, stat_addr, ru);
+ return wait_task_zombie(wo, p);
/*
* It's stopped or running now, so it might
* later continue, exit, or stop again.
*/
- *notask_error = 0;
+ wo->notask_error = 0;
- if (task_is_stopped_or_traced(p))
- return wait_task_stopped(ptrace, p, options,
- infop, stat_addr, ru);
+ if (task_stopped_code(p, ptrace))
+ return wait_task_stopped(wo, ptrace, p);
- return wait_task_continued(p, options, infop, stat_addr, ru);
+ return wait_task_continued(wo, p);
}
/*
* Do the work of do_wait() for one thread in the group, @tsk.
*
- * -ECHILD should be in *@notask_error before the first call.
+ * -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children,
+ * ->notask_error is 0 if there were any eligible children,
* or another error from security_task_wait(), or still -ECHILD.
*/
-static int do_wait_thread(struct task_struct *tsk, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
@@ -1636,9 +1545,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
* Do not consider detached threads.
*/
if (!task_detached(p)) {
- int ret = wait_consider_task(tsk, 0, p, notask_error,
- type, pid, options,
- infop, stat_addr, ru);
+ int ret = wait_consider_task(wo, tsk, 0, p);
if (ret)
return ret;
}
@@ -1647,22 +1554,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
return 0;
}
-static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
- enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
- /*
- * Traditionally we see ptrace'd stopped tasks regardless of options.
- */
- options |= WUNTRACED;
-
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
- int ret = wait_consider_task(tsk, 1, p, notask_error,
- type, pid, options,
- infop, stat_addr, ru);
+ int ret = wait_consider_task(wo, tsk, 1, p);
if (ret)
return ret;
}
@@ -1670,65 +1567,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
return 0;
}
-static long do_wait(enum pid_type type, struct pid *pid, int options,
- struct siginfo __user *infop, int __user *stat_addr,
- struct rusage __user *ru)
+static long do_wait(struct wait_opts *wo)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int retval;
- trace_sched_process_wait(pid);
+ trace_sched_process_wait(wo->wo_pid);
add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
/*
* If there is nothing that can match our critiera just get out.
- * We will clear @retval to zero if we see any child that might later
- * match our criteria, even if we are not able to reap it yet.
+ * We will clear ->notask_error to zero if we see any child that
+ * might later match our criteria, even if we are not able to reap
+ * it yet.
*/
- retval = -ECHILD;
- if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
- goto end;
+ wo->notask_error = -ECHILD;
+ if ((wo->wo_type < PIDTYPE_MAX) &&
+ (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ goto notask;
- current->state = TASK_INTERRUPTIBLE;
+ set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
tsk = current;
do {
- int tsk_result = do_wait_thread(tsk, &retval,
- type, pid, options,
- infop, stat_addr, ru);
- if (!tsk_result)
- tsk_result = ptrace_do_wait(tsk, &retval,
- type, pid, options,
- infop, stat_addr, ru);
- if (tsk_result) {
- /*
- * tasklist_lock is unlocked and we have a final result.
- */
- retval = tsk_result;
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
goto end;
- }
- if (options & __WNOTHREAD)
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
break;
- tsk = next_thread(tsk);
- BUG_ON(tsk->signal != current->signal);
- } while (tsk != current);
+ } while_each_thread(current, tsk);
read_unlock(&tasklist_lock);
- if (!retval && !(options & WNOHANG)) {
+notask:
+ retval = wo->notask_error;
+ if (!retval && !(wo->wo_flags & WNOHANG)) {
retval = -ERESTARTSYS;
if (!signal_pending(current)) {
schedule();
goto repeat;
}
}
-
end:
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit,&wait);
- if (infop) {
+ if (wo->wo_info) {
+ struct siginfo __user *infop = wo->wo_info;
+
if (retval > 0)
retval = 0;
else {
@@ -1757,6 +1648,7 @@ end:
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
infop, int, options, struct rusage __user *, ru)
{
+ struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
@@ -1786,7 +1678,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
if (type < PIDTYPE_MAX)
pid = find_get_pid(upid);
- ret = do_wait(type, pid, options, infop, NULL, ru);
+
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options;
+ wo.wo_info = infop;
+ wo.wo_stat = NULL;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
put_pid(pid);
/* avoid REGPARM breakage on x86: */
@@ -1797,6 +1696,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
+ struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
@@ -1812,13 +1712,19 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
- pid = get_pid(task_pgrp(current));
+ pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);
}
- ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options | WEXITED;
+ wo.wo_info = NULL;
+ wo.wo_stat = stat_addr;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
put_pid(pid);
/* avoid REGPARM breakage on x86: */
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82b..7f8f263f852 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -15,11 +15,22 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/ftrace.h>
+#include <linux/memory.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/init.h>
-#include <linux/ftrace.h>
-#include <asm/uaccess.h>
+
#include <asm/sections.h>
+#include <asm/uaccess.h>
+
+/*
+ * mutex protecting text section modification (dynamic code patching).
+ * some users need to sleep (allocating memory...) while they hold this lock.
+ *
+ * NOT exported to modules - patching kernel text is a really delicate matter.
+ */
+DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
@@ -41,31 +52,50 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
-__notrace_funcgraph int core_kernel_text(unsigned long addr)
+static inline int init_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext &&
+ addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
- addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ init_kernel_text(addr))
return 1;
return 0;
}
-__notrace_funcgraph int __kernel_text_address(unsigned long addr)
+int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return __module_text_address(addr) != NULL;
+ if (is_module_text_address(addr))
+ return 1;
+ /*
+ * There might be init symbols in saved stacktraces.
+ * Give those symbols a chance to be printed in
+ * backtraces (such as lockdep traces).
+ *
+ * Since we are after the module-symbols check, there's
+ * no danger of address overlap:
+ */
+ if (init_kernel_text(addr))
+ return 1;
+ return 0;
}
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return module_text_address(addr) != NULL;
+ return is_module_text_address(addr);
}
/*
@@ -81,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
addr = (unsigned long) dereference_function_descriptor(ptr);
if (core_kernel_text(addr))
return 1;
- return module_text_address(addr) != NULL;
+ return is_module_text_address(addr);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index bf0cef8bbdf..bfee931ee3f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
-#include <linux/mnt_namespace.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
@@ -60,7 +59,9 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
-#include <trace/sched.h>
+#include <linux/fs_struct.h>
+#include <linux/magic.h>
+#include <linux/perf_counter.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -69,6 +70,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <trace/events/sched.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -81,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
-DEFINE_TRACE(sched_process_fork);
-
int nr_processes(void)
{
int cpu;
@@ -151,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
- put_cred(tsk->real_cred);
- put_cred(tsk->cred);
+ exit_creds(tsk);
delayacct_tsk_free(tsk);
if (!profile_handoff_task(tsk))
@@ -176,7 +176,7 @@ void __init fork_init(unsigned long mempages)
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
/* do the arch specific task caches init */
@@ -212,6 +212,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
+ unsigned long *stackend;
+
int err;
prepare_to_copy(orig);
@@ -237,6 +239,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto out;
setup_thread_stack(tsk, orig);
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
@@ -279,7 +283,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
- cpus_clear(mm->cpu_vm_mask);
+ cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
@@ -562,18 +566,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
- if (tsk->clear_child_tid
- && !(tsk->flags & PF_SIGNALED)
- && atomic_read(&mm->mm_users) > 1) {
- u32 __user * tidptr = tsk->clear_child_tid;
+ if (tsk->clear_child_tid) {
+ if (!(tsk->flags & PF_SIGNALED) &&
+ atomic_read(&mm->mm_users) > 1) {
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tsk->clear_child_tid);
+ sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0);
+ }
tsk->clear_child_tid = NULL;
-
- /*
- * We don't check the error code - if userspace has
- * not set up a proper pointer then tough luck.
- */
- put_user(0, tidptr);
- sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
}
}
@@ -639,6 +643,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+ tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
tsk->mm = NULL;
tsk->active_mm = NULL;
@@ -676,38 +683,21 @@ fail_nomem:
return retval;
}
-static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
-{
- struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
- /* We don't need to lock fs - think why ;-) */
- if (fs) {
- atomic_set(&fs->count, 1);
- rwlock_init(&fs->lock);
- fs->umask = old->umask;
- read_lock(&old->lock);
- fs->root = old->root;
- path_get(&old->root);
- fs->pwd = old->pwd;
- path_get(&old->pwd);
- read_unlock(&old->lock);
- }
- return fs;
-}
-
-struct fs_struct *copy_fs_struct(struct fs_struct *old)
-{
- return __copy_fs_struct(old);
-}
-
-EXPORT_SYMBOL_GPL(copy_fs_struct);
-
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
+ struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
- atomic_inc(&current->fs->count);
+ /* tsk->fs is already what we want */
+ write_lock(&fs->lock);
+ if (fs->in_exec) {
+ write_unlock(&fs->lock);
+ return -EAGAIN;
+ }
+ fs->users++;
+ write_unlock(&fs->lock);
return 0;
}
- tsk->fs = __copy_fs_struct(current->fs);
+ tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
@@ -808,6 +798,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
sig->cputime_expires.virt_exp = cputime_zero;
sig->cputime_expires.sched_exp = 0;
+ if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+ sig->cputime_expires.prof_exp =
+ secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+ sig->cputimer.running = 1;
+ }
+
/* The timer lists. */
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -817,16 +813,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
- int ret;
- if (clone_flags & CLONE_THREAD) {
- ret = thread_group_cputime_clone_thread(current);
- if (likely(!ret)) {
- atomic_inc(&current->signal->count);
- atomic_inc(&current->signal->live);
- }
- return ret;
- }
+ if (clone_flags & CLONE_THREAD)
+ return 0;
+
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
if (!sig)
@@ -836,6 +826,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->live, 1);
init_waitqueue_head(&sig->wait_chldexit);
sig->flags = 0;
+ if (clone_flags & CLONE_NEWPID)
+ sig->flags |= SIGNAL_UNKILLABLE;
sig->group_exit_code = 0;
sig->group_exit_task = NULL;
sig->group_stop_count = 0;
@@ -851,13 +843,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->tty_old_pgrp = NULL;
sig->tty = NULL;
- sig->cutime = sig->cstime = cputime_zero;
+ sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
task_io_accounting_init(&sig->ioac);
+ sig->sum_sched_runtime = 0;
taskstats_tgid_init(sig);
task_lock(current->group_leader);
@@ -880,16 +873,6 @@ void __cleanup_signal(struct signal_struct *sig)
kmem_cache_free(signal_cachep, sig);
}
-static void cleanup_signal(struct task_struct *tsk)
-{
- struct signal_struct *sig = tsk->signal;
-
- atomic_dec(&sig->live);
-
- if (atomic_dec_and_test(&sig->count))
- __cleanup_signal(sig);
-}
-
static void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
@@ -984,6 +967,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (!p)
goto fork_out;
+ ftrace_graph_init_task(p);
+
rt_mutex_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
@@ -1007,6 +992,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
+ retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
@@ -1021,14 +1007,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
-#ifdef CONFIG_PREEMPT_RCU
- p->rcu_read_lock_nesting = 0;
- p->rcu_flipctr_idx = 0;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
- clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
p->utime = cputime_zero;
@@ -1041,11 +1023,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->default_timer_slack_ns = current->timer_slack_ns;
-#ifdef CONFIG_DETECT_SOFTLOCKUP
- p->last_switch_count = 0;
- p->last_switch_timestamp = 0;
-#endif
-
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
@@ -1095,12 +1072,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
- if (unlikely(ptrace_reparented(current)))
- ptrace_fork(p, clone_flags);
+
+ p->bts = NULL;
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
+ retval = perf_counter_init_task(p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
+
if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_policy;
/* copy all the process information */
@@ -1120,7 +1101,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
@@ -1137,8 +1118,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
}
- ftrace_graph_init_task(p);
-
p->pid = pid_nr(pid);
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
@@ -1147,7 +1126,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (current->nsproxy != p->nsproxy) {
retval = ns_cgroup_clone(p, pid);
if (retval)
- goto bad_fork_free_graph;
+ goto bad_fork_free_pid;
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1179,10 +1158,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
clear_all_latency_tracing(p);
- /* Our parent execution domain becomes current domain
- These must match for thread signalling to apply */
- p->parent_exec_id = p->self_exec_id;
-
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
@@ -1220,10 +1195,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
set_task_cpu(p, smp_processor_id());
/* CLONE_PARENT re-uses the old parent */
- if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
- else
+ p->parent_exec_id = current->parent_exec_id;
+ } else {
p->real_parent = current;
+ p->parent_exec_id = current->self_exec_id;
+ }
spin_lock(&current->sighand->siglock);
@@ -1240,10 +1218,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_graph;
+ goto bad_fork_free_pid;
}
if (clone_flags & CLONE_THREAD) {
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
}
@@ -1259,8 +1239,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->signal->leader_pid = pid;
tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
- set_task_pgrp(p, task_pgrp_nr(current));
- set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1275,10 +1253,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ perf_counter_fork(p);
return p;
-bad_fork_free_graph:
- ftrace_graph_exit_task(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1290,7 +1267,8 @@ bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
- cleanup_signal(p);
+ if (!(clone_flags & CLONE_THREAD))
+ __cleanup_signal(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
@@ -1302,6 +1280,7 @@ bad_fork_cleanup_semundo:
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_policy:
+ perf_counter_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
@@ -1314,8 +1293,7 @@ bad_fork_cleanup_put_domain:
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
- put_cred(p->real_cred);
- put_cred(p->cred);
+ exit_creds(p);
bad_fork_free:
free_task(p);
fork_out:
@@ -1418,7 +1396,7 @@ long do_fork(unsigned long clone_flags,
}
audit_finish_fork(p);
- tracehook_report_clone(trace, regs, clone_flags, nr, p);
+ tracehook_report_clone(regs, clone_flags, nr, p);
/*
* We set PF_STARTING at creation in case tracing wants to
@@ -1470,20 +1448,21 @@ void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
- sighand_ctor);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_NOTRACK, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
@@ -1539,12 +1518,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
struct fs_struct *fs = current->fs;
- if ((unshare_flags & CLONE_FS) &&
- (fs && atomic_read(&fs->count) > 1)) {
- *new_fsp = __copy_fs_struct(current->fs);
- if (!*new_fsp)
- return -ENOMEM;
- }
+ if (!(unshare_flags & CLONE_FS) || !fs)
+ return 0;
+
+ /* don't need lock here; in the worst case we'll do useless copy */
+ if (fs->users == 1)
+ return 0;
+
+ *new_fsp = copy_fs_struct(fs);
+ if (!*new_fsp)
+ return -ENOMEM;
return 0;
}
@@ -1660,8 +1643,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (new_fs) {
fs = current->fs;
+ write_lock(&fs->lock);
current->fs = new_fs;
- new_fs = fs;
+ if (--fs->users)
+ new_fs = NULL;
+ else
+ new_fs = fs;
+ write_unlock(&fs->lock);
}
if (new_mm) {
@@ -1700,7 +1688,7 @@ bad_unshare_cleanup_sigh:
bad_unshare_cleanup_fs:
if (new_fs)
- put_fs_struct(new_fs);
+ free_fs_struct(new_fs);
bad_unshare_cleanup_thread:
bad_unshare_out:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf708..bd1d42b17cb 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
+ /* prevent accounting of that task to load */
+ current->flags |= PF_FREEZING;
+
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!frozen(current))
break;
schedule();
}
+
+ /* Remove the accounting blocker */
+ current->flags &= ~PF_FREEZING;
+
pr_debug("%s left refrigerator\n", current->comm);
__set_current_state(save);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index f89d373a9c6..248dd119a86 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
* PRIVATE futexes by Eric Dumazet
* Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
*
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
*/
struct futex_q {
struct plist_node list;
- /* There can only be a single waiter */
- wait_queue_head_t waiter;
+ /* Waiter reference */
+ struct task_struct *task;
/* Which hash list lock to use: */
spinlock_t *lock_ptr;
@@ -107,14 +111,21 @@ struct futex_q {
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
- struct task_struct *task;
+
+ /* rt_waiter storage for requeue_pi: */
+ struct rt_mutex_waiter *rt_waiter;
+
+ /* The expected requeue pi target futex key: */
+ union futex_key *requeue_pi_key;
/* Bitset for the optional bitmasked wakeup */
u32 bitset;
};
/*
- * Split the global futex_lock into every hash list lock.
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
*/
struct futex_hash_bucket {
spinlock_t lock;
@@ -189,9 +200,9 @@ static void drop_futex_key_refs(union futex_key *key)
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
- * @shared: NULL for a PROCESS_PRIVATE futex,
- * &current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -200,11 +211,10 @@ static void drop_futex_key_refs(union futex_key *key)
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
- * fshared is NULL for PROCESS_PRIVATE futexes
- * For other futexes, it points to &current->mm->mmap_sem and
- * caller must have taken the reader lock. but NOT any spinlocks.
+ * lock_page() might sleep, the caller should not hold a spinlock.
*/
-static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -227,7 +237,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
@@ -236,10 +246,11 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
}
again:
- err = get_user_pages_fast(address, 1, 0, &page);
+ err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
if (err < 0)
return err;
+ page = compound_head(page);
lock_page(page);
if (!page->mapping) {
unlock_page(page);
@@ -277,6 +288,44 @@ void put_futex_key(int fshared, union futex_key *key)
drop_futex_key_refs(key);
}
+/*
+ * fault_in_user_writeable - fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+ int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+ 1, 1, 0, NULL, NULL);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+ union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (match_futex(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 curval;
@@ -299,41 +348,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
return ret ? -EFAULT : 0;
}
-/*
- * Fault handling.
- */
-static int futex_handle_fault(unsigned long address, int attempt)
-{
- struct vm_area_struct * vma;
- struct mm_struct *mm = current->mm;
- int ret = -EFAULT;
-
- if (attempt > 2)
- return ret;
-
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, address);
- if (vma && address >= vma->vm_start &&
- (vma->vm_flags & VM_WRITE)) {
- int fault;
- fault = handle_mm_fault(mm, vma, address, 1);
- if (unlikely((fault & VM_FAULT_ERROR))) {
-#if 0
- /* XXX: let's do this when we verify it is OK */
- if (ret & VM_FAULT_OOM)
- ret = -ENOMEM;
-#endif
- } else {
- ret = 0;
- if (fault & VM_FAULT_MAJOR)
- current->maj_flt++;
- else
- current->min_flt++;
- }
- }
- up_read(&mm->mmap_sem);
- return ret;
-}
/*
* PI code:
@@ -573,29 +587,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return 0;
}
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ * 0 - ready to wait
+ * 1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task, int set_waiters)
+{
+ int lock_taken, ret, ownerdied = 0;
+ u32 uval, newval, curval;
+
+retry:
+ ret = lock_taken = 0;
+
+ /*
+ * To avoid races, we attempt to take the lock here again
+ * (by doing a 0 -> TID atomic cmpxchg), while holding all
+ * the locks. It will most likely not succeed.
+ */
+ newval = task_pid_vnr(task);
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ return -EDEADLK;
+
+ /*
+ * Surprise - we got the lock. Just return to userspace:
+ */
+ if (unlikely(!curval))
+ return 1;
+
+ uval = curval;
+
+ /*
+ * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+ * to wake at the next unlock.
+ */
+ newval = curval | FUTEX_WAITERS;
+
+ /*
+ * There are two cases, where a futex might have no owner (the
+ * owner TID is 0): OWNER_DIED. We take over the futex in this
+ * case. We also do an unconditional take over, when the owner
+ * of the futex died.
+ *
+ * This is safe as we are protected by the hash bucket lock !
+ */
+ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+ /* Keep the OWNER_DIED bit */
+ newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ ownerdied = 0;
+ lock_taken = 1;
+ }
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+
+ /*
+ * We took the lock due to owner died take over.
+ */
+ if (unlikely(lock_taken))
+ return 1;
+
+ /*
+ * We dont have the lock. Look up the PI state (or create it if
+ * we are the first waiter):
+ */
+ ret = lookup_pi_state(uval, hb, key, ps);
+
+ if (unlikely(ret)) {
+ switch (ret) {
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /*
+ * We simply start over in case of a robust
+ * futex. The code above will take the futex
+ * and return happy.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ ownerdied = 1;
+ goto retry;
+ }
+ default:
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
static void wake_futex(struct futex_q *q)
{
- plist_del(&q->list, &q->list.plist);
+ struct task_struct *p = q->task;
+
/*
- * The lock in wake_up_all() is a crucial memory barrier after the
- * plist_del() and also before assigning to q->lock_ptr.
+ * We set q->lock_ptr = NULL _before_ we wake up the task. If
+ * a non futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non existing task
+ * struct. Prevent this by holding a reference on p across the
+ * wake up.
*/
- wake_up(&q->waiter);
+ get_task_struct(p);
+
+ plist_del(&q->list, &q->list.plist);
/*
- * The waiting task can free the futex_q as soon as this is written,
- * without taking any locks. This must come last.
- *
- * A memory barrier is required here to prevent the following store
- * to lock_ptr from getting ahead of the wakeup. Clearing the lock
- * at the end of wake_up_all() does not prevent this store from
- * moving.
+ * The waiting task can free the futex_q as soon as
+ * q->lock_ptr = NULL is written, without taking any locks. A
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
*/
smp_wmb();
q->lock_ptr = NULL;
+
+ wake_up_state(p, TASK_NORMAL);
+ put_task_struct(p);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -692,9 +837,16 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
}
}
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
/*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
+ * Wake up waiters matching bitset queued on this futex (uaddr).
*/
static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
{
@@ -707,7 +859,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -717,7 +869,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
- if (this->pi_state) {
+ if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
break;
}
@@ -750,29 +902,25 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
- int ret, op_ret, attempt = 0;
+ int ret, op_ret;
-retryfull:
- ret = get_futex_key(uaddr1, fshared, &key1);
+retry:
+ ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
-retry:
double_lock_hb(hb1, hb2);
-
+retry_private:
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- u32 dummy;
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
+ double_unlock_hb(hb1, hb2);
#ifndef CONFIG_MMU
/*
@@ -788,26 +936,16 @@ retry:
goto out_put_keys;
}
- /*
- * futex_atomic_op_inuser needs to both read and write
- * *(int __user *)uaddr2, but we can't modify it
- * non-atomically. Therefore, if get_user below is not
- * enough, we need to handle the fault ourselves, while
- * still holding the mmap_sem.
- */
- if (attempt++) {
- ret = futex_handle_fault((unsigned long)uaddr2,
- attempt);
- if (ret)
- goto out_put_keys;
- goto retry;
- }
-
- ret = get_user(dummy, uaddr2);
+ ret = fault_in_user_writeable(uaddr2);
if (ret)
- return ret;
+ goto out_put_keys;
+
+ if (!fshared)
+ goto retry_private;
- goto retryfull;
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ goto retry;
}
head = &hb1->chain;
@@ -834,9 +972,7 @@ retry:
ret += op_ret;
}
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
+ double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(fshared, &key2);
out_put_key1:
@@ -845,30 +981,205 @@ out:
return ret;
}
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb2->lock;
+#endif
+ }
+ get_futex_key_refs(key2);
+ q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * q: the futex_q
+ * key: the key of the requeue target futex
+ * hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal. Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later. Must be called
+ * with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ drop_futex_key_refs(&q->key);
+ get_futex_key_refs(key);
+ q->key = *key;
+
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ * 0 - failed to acquire the lock atomicly
+ * 1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+ struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2,
+ union futex_key *key1, union futex_key *key2,
+ struct futex_pi_state **ps, int set_waiters)
+{
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+ int ret;
+
+ if (get_futex_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!match_futex(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /*
+ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
+ * the contended case or if set_waiters is 1. The pi_state is returned
+ * in ps in contended cases.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ set_waiters);
+ if (ret == 1)
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * uaddr1: source futex user address
+ * uaddr2: target futex user address
+ * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ * <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval)
+ int nr_wake, int nr_requeue, u32 *cmpval,
+ int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int drop_count = 0, task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
- int ret, drop_count = 0;
+ u32 curval2;
+
+ if (requeue_pi) {
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ /*
+ * requeue_pi must wake as many tasks as it can, up to nr_wake
+ * + nr_requeue, since it acquires the rt_mutex prior to
+ * returning to userspace, so as to not leave the rt_mutex with
+ * waiters and no owner. However, second and third wake-ups
+ * cannot be predicted as they involve race conditions with the
+ * first wake and a fault while looking up the pi_state. Both
+ * pthread_cond_signal() and pthread_cond_broadcast() should
+ * use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+ }
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ if (pi_state != NULL) {
+ /*
+ * We will have to lookup the pi_state again, so free this one
+ * to keep the accounting correct.
+ */
+ free_pi_state(pi_state);
+ pi_state = NULL;
+ }
+
+ ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
+retry_private:
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -877,16 +1188,18 @@ retry:
ret = get_futex_value_locked(&curval, uaddr1);
if (unlikely(ret)) {
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
+ double_unlock_hb(hb1, hb2);
ret = get_user(curval, uaddr1);
+ if (ret)
+ goto out_put_keys;
- if (!ret)
- goto retry;
+ if (!fshared)
+ goto retry_private;
- goto out_put_keys;
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ goto retry;
}
if (curval != *cmpval) {
ret = -EAGAIN;
@@ -894,40 +1207,123 @@ retry:
}
}
+ if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or is
+ * waiting on it. If the former, then the pi_state will not
+ * exist yet, look it up one more time to ensure we have a
+ * reference to it.
+ */
+ if (ret == 1) {
+ WARN_ON(pi_state);
+ task_count++;
+ ret = get_futex_value_locked(&curval2, uaddr2);
+ if (!ret)
+ ret = lookup_pi_state(curval2, hb2, &key2,
+ &pi_state);
+ }
+
+ switch (ret) {
+ case 0:
+ break;
+ case -EFAULT:
+ double_unlock_hb(hb1, hb2);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ goto out;
+ case -EAGAIN:
+ /* The owner was exiting, try again. */
+ double_unlock_hb(hb1, hb2);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
head1 = &hb1->chain;
plist_for_each_entry_safe(this, next, head1, list) {
- if (!match_futex (&this->key, &key1))
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!match_futex(&this->key, &key1))
continue;
- if (++ret <= nr_wake) {
+
+ /*
+ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Wake nr_wake waiters. For requeue_pi, if we acquired the
+ * lock, we already woke the top_waiter. If not, it will be
+ * woken by futex_unlock_pi().
+ */
+ if (++task_count <= nr_wake && !requeue_pi) {
wake_futex(this);
- } else {
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(head1 != &hb2->chain)) {
- plist_del(&this->list, &hb1->chain);
- plist_add(&this->list, &hb2->chain);
- this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- this->list.plist.lock = &hb2->lock;
-#endif
- }
- this->key = key2;
- get_futex_key_refs(&key2);
- drop_count++;
+ continue;
+ }
- if (ret - nr_wake >= nr_requeue)
- break;
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ */
+ if (requeue_pi) {
+ /* Prepare the waiter to take the rt_mutex. */
+ atomic_inc(&pi_state->refcount);
+ this->pi_state = pi_state;
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task, 1);
+ if (ret == 1) {
+ /* We got the lock. */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ continue;
+ } else if (ret) {
+ /* -EDEADLK */
+ this->pi_state = NULL;
+ free_pi_state(pi_state);
+ goto out_unlock;
+ }
}
+ requeue_futex(this, hb1, hb2, &key2);
+ drop_count++;
}
out_unlock:
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
+ double_unlock_hb(hb1, hb2);
- /* drop_futex_key_refs() must be called outside the spinlocks. */
+ /*
+ * drop_futex_key_refs() must be called outside the spinlocks. During
+ * the requeue we moved futex_q's from the hash bucket at key1 to the
+ * one at key2 and updated their key pointer. We no longer need to
+ * hold the references to key1.
+ */
while (--drop_count >= 0)
drop_futex_key_refs(&key1);
@@ -936,7 +1332,9 @@ out_put_keys:
out_put_key1:
put_futex_key(fshared, &key1);
out:
- return ret;
+ if (pi_state != NULL)
+ free_pi_state(pi_state);
+ return ret ? ret : task_count;
}
/* The key must be already stored in q->key. */
@@ -944,8 +1342,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
{
struct futex_hash_bucket *hb;
- init_waitqueue_head(&q->waiter);
-
get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -1063,7 +1459,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
- int ret, attempt = 0;
+ int ret;
/* Owner died? */
if (!pi_state->owner)
@@ -1076,11 +1472,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the user space variable. This must be atomic as we have
* to preserve the owner died bit here.
*
- * Note: We write the user space value _before_ changing the
- * pi_state because we can fault here. Imagine swapped out
- * pages or a fork, which was running right before we acquired
- * mmap_sem, that marked all the anonymous memory readonly for
- * cow.
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
*
* Modifying pi_state _before_ the user space value would
* leave the pi_state in an inconsistent state when we fault
@@ -1136,7 +1530,7 @@ retry:
handle_fault:
spin_unlock(q->lock_ptr);
- ret = futex_handle_fault((unsigned long)uaddr, attempt++);
+ ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
@@ -1158,36 +1552,152 @@ handle_fault:
*/
#define FLAGS_SHARED 0x01
#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @fshared: whether the futex is shared (1) or not (0)
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ * 1 - success, lock taken
+ * 0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+ int locked)
{
- struct task_struct *curr = current;
- DECLARE_WAITQUEUE(wait, curr);
- struct futex_hash_bucket *hb;
- struct futex_q q;
- u32 uval;
- int ret;
- struct hrtimer_sleeper t;
- int rem = 0;
+ struct task_struct *owner;
+ int ret = 0;
- if (!bitset)
- return -EINVAL;
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ goto out;
+ }
- q.pi_state = NULL;
- q.bitset = bitset;
-retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
- if (unlikely(ret != 0))
+ /*
+ * Catch the rare case, where the lock was released when we were on the
+ * way back before we locked the hash bucket.
+ */
+ if (q->pi_state->owner == current) {
+ /*
+ * Try to get the rt_mutex now. This might fail as some other
+ * task acquired the rt_mutex after we removed ourself from the
+ * rt_mutex waiters list.
+ */
+ if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+ locked = 1;
+ goto out;
+ }
+
+ /*
+ * pi_state is incorrect, some other task did a lock steal and
+ * we returned due to timeout or signal without taking the
+ * rt_mutex. Too late. We can access the rt_mutex_owner without
+ * locking, as the other task is now blocked on the hash bucket
+ * lock. Fix the state up.
+ */
+ owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+ ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
goto out;
+ }
- hb = queue_lock(&q);
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner, nor the pending owner, of the rt_mutex.
+ */
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+ "pi-state %p\n", ret,
+ q->pi_state->pi_mutex.owner,
+ q->pi_state->owner);
+
+out:
+ return ret ? ret : locked;
+}
+
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ queue_me(q, hb);
/*
- * Access the page AFTER the futex is queued.
+ * There might have been scheduling since the queue_me(), as we
+ * cannot hold a spinlock across the get_user() in case it
+ * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+ * queueing ourselves into the futex hash. This code thus has to
+ * rely on the futex_wake() code removing us from hash when it
+ * wakes us up.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* Arm the timer */
+ if (timeout) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ /*
+ * !plist_node_empty() is safe here without any lock.
+ * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @fshared: whether the futex is shared (1) or not (0)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ * 0 - uaddr contains val and hb has been locked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
* Order is important:
*
* Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
@@ -1202,121 +1712,116 @@ retry:
* A consequence is that futex_wait() can return zero and absorb
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
- *
- * for shared futexes, we hold the mmap semaphore, so the mapping
- * cannot have changed since we looked it up in get_futex_key.
*/
+retry:
+ q->key = FUTEX_KEY_INIT;
+ ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = queue_lock(q);
+
ret = get_futex_value_locked(&uval, uaddr);
- if (unlikely(ret)) {
- queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ if (ret) {
+ queue_unlock(q, *hb);
ret = get_user(uval, uaddr);
+ if (ret)
+ goto out;
- if (!ret)
- goto retry;
- return ret;
+ if (!fshared)
+ goto retry_private;
+
+ put_futex_key(fshared, &q->key);
+ goto retry;
}
- ret = -EWOULDBLOCK;
- if (uval != val)
- goto out_unlock_put_key;
- /* Only actually queue if *uaddr contained val. */
- queue_me(&q, hb);
+ if (uval != val) {
+ queue_unlock(q, *hb);
+ ret = -EWOULDBLOCK;
+ }
- /*
- * There might have been scheduling since the queue_me(), as we
- * cannot hold a spinlock across the get_user() in case it
- * faults, and we cannot just set TASK_INTERRUPTIBLE state when
- * queueing ourselves into the futex hash. This code thus has to
- * rely on the futex_wake() code removing us from hash when it
- * wakes us up.
- */
+out:
+ if (ret)
+ put_futex_key(fshared, &q->key);
+ return ret;
+}
- /* add_wait_queue is the barrier after __set_current_state. */
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&q.waiter, &wait);
- /*
- * !plist_node_empty() is safe here without any lock.
- * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
- */
- if (likely(!plist_node_empty(&q.list))) {
- if (!abs_time)
- schedule();
- else {
- unsigned long slack;
- slack = current->timer_slack_ns;
- if (rt_task(current))
- slack = 0;
- hrtimer_init_on_stack(&t.timer,
- clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(&t, current);
- hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
-
- hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
+static int futex_wait(u32 __user *uaddr, int fshared,
+ u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct restart_block *restart;
+ struct futex_hash_bucket *hb;
+ struct futex_q q;
+ int ret;
- /*
- * the timer could have already expired, in which
- * case current would be flagged for rescheduling.
- * Don't bother calling schedule.
- */
- if (likely(t.task))
- schedule();
+ if (!bitset)
+ return -EINVAL;
- hrtimer_cancel(&t.timer);
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
- /* Flag if a timeout occured */
- rem = (t.task == NULL);
+ if (abs_time) {
+ to = &timeout;
- destroy_hrtimer_on_stack(&t.timer);
- }
+ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
}
- __set_current_state(TASK_RUNNING);
- /*
- * NOTE: we don't remove ourselves from the waitqueue because
- * we are the only user of it.
- */
+ /* Prepare to wait on uaddr. */
+ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ if (ret)
+ goto out;
+
+ /* queue_me and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue_me(hb, &q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
+ ret = 0;
if (!unqueue_me(&q))
- return 0;
- if (rem)
- return -ETIMEDOUT;
+ goto out_put_key;
+ ret = -ETIMEDOUT;
+ if (to && !to->task)
+ goto out_put_key;
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
+ ret = -ERESTARTSYS;
if (!abs_time)
- return -ERESTARTSYS;
- else {
- struct restart_block *restart;
- restart = &current_thread_info()->restart_block;
- restart->fn = futex_wait_restart;
- restart->futex.uaddr = (u32 *)uaddr;
- restart->futex.val = val;
- restart->futex.time = abs_time->tv64;
- restart->futex.bitset = bitset;
- restart->futex.flags = 0;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
- return -ERESTART_RESTARTBLOCK;
- }
+ goto out_put_key;
-out_unlock_put_key:
- queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->futex.uaddr = (u32 *)uaddr;
+ restart->futex.val = val;
+ restart->futex.time = abs_time->tv64;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = FLAGS_HAS_TIMEOUT;
+
+ if (fshared)
+ restart->futex.flags |= FLAGS_SHARED;
+ if (clockrt)
+ restart->futex.flags |= FLAGS_CLOCKRT;
+ ret = -ERESTART_RESTARTBLOCK;
+
+out_put_key:
+ put_futex_key(fshared, &q.key);
out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
return ret;
}
@@ -1325,13 +1830,16 @@ static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
int fshared = 0;
- ktime_t t;
+ ktime_t t, *tp = NULL;
- t.tv64 = restart->futex.time;
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t.tv64 = restart->futex.time;
+ tp = &t;
+ }
restart->fn = do_no_restart_syscall;
if (restart->futex.flags & FLAGS_SHARED)
fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+ return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
restart->futex.bitset,
restart->futex.flags & FLAGS_CLOCKRT);
}
@@ -1347,11 +1855,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
- struct task_struct *curr = current;
struct futex_hash_bucket *hb;
- u32 uval, newval, curval;
struct futex_q q;
- int ret, lock_taken, ownerdied = 0, attempt = 0;
+ int res, ret;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1365,117 +1871,35 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
}
q.pi_state = NULL;
+ q.rt_waiter = NULL;
+ q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
-retry_unlocked:
+retry_private:
hb = queue_lock(&q);
-retry_locked:
- ret = lock_taken = 0;
-
- /*
- * To avoid races, we attempt to take the lock here again
- * (by doing a 0 -> TID atomic cmpxchg), while holding all
- * the locks. It will most likely not succeed.
- */
- newval = task_pid_vnr(current);
-
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
- goto uaddr_faulted;
-
- /*
- * Detect deadlocks. In case of REQUEUE_PI this is a valid
- * situation and we return success to user space.
- */
- if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
- ret = -EDEADLK;
- goto out_unlock_put_key;
- }
-
- /*
- * Surprise - we got the lock. Just return to userspace:
- */
- if (unlikely(!curval))
- goto out_unlock_put_key;
-
- uval = curval;
-
- /*
- * Set the WAITERS flag, so the owner will know it has someone
- * to wake at next unlock
- */
- newval = curval | FUTEX_WAITERS;
-
- /*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
- */
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
- ownerdied = 0;
- lock_taken = 1;
- }
-
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
- goto uaddr_faulted;
- if (unlikely(curval != uval))
- goto retry_locked;
-
- /*
- * We took the lock due to owner died take over.
- */
- if (unlikely(lock_taken))
- goto out_unlock_put_key;
-
- /*
- * We dont have the lock. Look up the PI state (or create it if
- * we are the first waiter):
- */
- ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
switch (ret) {
-
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
case -EAGAIN:
/*
* Task is exiting and we just wait for the
* exit to complete.
*/
queue_unlock(&q, hb);
+ put_futex_key(fshared, &q.key);
cond_resched();
goto retry;
-
- case -ESRCH:
- /*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
- */
- if (get_futex_value_locked(&curval, uaddr))
- goto uaddr_faulted;
-
- /*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
- */
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
- goto retry_locked;
- }
default:
goto out_unlock_put_key;
}
@@ -1499,74 +1923,29 @@ retry_locked:
}
spin_lock(q.lock_ptr);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr, fshared, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
- if (!ret) {
- /*
- * Got the lock. We might not be the anticipated owner
- * if we did a lock-steal - fix up the PI-state in
- * that case:
- */
- if (q.pi_state->owner != curr)
- ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
- } else {
- /*
- * Catch the rare case, where the lock was released
- * when we were on the way back before we locked the
- * hash bucket.
- */
- if (q.pi_state->owner == curr) {
- /*
- * Try to get the rt_mutex now. This might
- * fail as some other task acquired the
- * rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q.pi_state->pi_mutex))
- ret = 0;
- else {
- /*
- * pi_state is incorrect, some other
- * task did a lock steal and we
- * returned due to timeout or signal
- * without taking the rt_mutex. Too
- * late. We can access the
- * rt_mutex_owner without locking, as
- * the other task is now blocked on
- * the hash bucket lock. Fix the state
- * up.
- */
- struct task_struct *owner;
- int res;
-
- owner = rt_mutex_owner(&q.pi_state->pi_mutex);
- res = fixup_pi_state_owner(uaddr, &q, owner,
- fshared);
-
- /* propagate -EFAULT, if the fixup failed */
- if (res)
- ret = res;
- }
- } else {
- /*
- * Paranoia check. If we did not take the lock
- * in the trylock above, then we should not be
- * the owner of the rtmutex, neither the real
- * nor the pending one:
- */
- if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
- printk(KERN_ERR "futex_lock_pi: ret = %d "
- "pi-mutex: %p pi-state %p\n", ret,
- q.pi_state->pi_mutex.owner,
- q.pi_state->owner);
- }
- }
+ /*
+ * If fixup_owner() faulted and was unable to handle the fault, unlock
+ * it and return the fault to userspace.
+ */
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+ rt_mutex_unlock(&q.pi_state->pi_mutex);
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- if (to)
- destroy_hrtimer_on_stack(&to->timer);
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
+ goto out;
out_unlock_put_key:
queue_unlock(&q, hb);
@@ -1576,32 +1955,20 @@ out_put_key:
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
- return ret;
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
queue_unlock(&q, hb);
- if (attempt++) {
- ret = futex_handle_fault((unsigned long)uaddr, attempt);
- if (ret)
- goto out_put_key;
- goto retry_unlocked;
- }
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out_put_key;
- ret = get_user(uval, uaddr);
- if (!ret)
- goto retry;
+ if (!fshared)
+ goto retry_private;
- if (to)
- destroy_hrtimer_on_stack(&to->timer);
- return ret;
+ put_futex_key(fshared, &q.key);
+ goto retry;
}
/*
@@ -1616,7 +1983,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
- int ret, attempt = 0;
+ int ret;
retry:
if (get_user(uval, uaddr))
@@ -1627,12 +1994,11 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
hb = hash_futex(&key);
-retry_unlocked:
spin_lock(&hb->lock);
/*
@@ -1689,27 +2055,234 @@ out:
return ret;
pi_faulted:
+ spin_unlock(&hb->lock);
+ put_futex_key(fshared, &key);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @key2: the futex_key of the requeue target futex
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex. If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller. Must be
+ * called with the hb lock held.
+ *
+ * Returns
+ * 0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q, union futex_key *key2,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret = 0;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ if (!match_futex(&q->key, key2)) {
+ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &q->list.plist);
+ drop_futex_key_refs(&q->key);
+
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else
+ ret = -ERESTARTNOINTR;
+ }
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initialyl wait on (non-pi)
+ * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ * 0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ int clockrt, u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct rt_mutex *pi_mutex = NULL;
+ struct futex_hash_bucket *hb;
+ union futex_key key2;
+ struct futex_q q;
+ int res, ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ if (abs_time) {
+ to = &timeout;
+ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
+ }
+
/*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
*/
+ debug_rt_mutex_init_waiter(&rt_waiter);
+ rt_waiter.task = NULL;
+
+ key2 = FUTEX_KEY_INIT;
+ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.pi_state = NULL;
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /* Prepare to wait on uaddr. */
+ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ if (ret)
+ goto out_key2;
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue_me(hb, &q, to);
+
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
- if (attempt++) {
- ret = futex_handle_fault((unsigned long)uaddr, attempt);
- if (ret)
- goto out;
- uval = 0;
- goto retry_unlocked;
+ /*
+ * In order for us to be here, we know our q.key == key2, and since
+ * we took the hb->lock above, we also know that futex_requeue() has
+ * completed and we no longer have to concern ourselves with a wakeup
+ * race with the atomic proxy lock acquition by the requeue code.
+ */
+
+ /* Check if the requeue code acquired the second futex for us. */
+ if (!q.rt_waiter) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current,
+ fshared);
+ spin_unlock(q.lock_ptr);
+ }
+ } else {
+ /*
+ * We have been woken up by futex_unlock_pi(), a timeout, or a
+ * signal. futex_unlock_pi() will not destroy the lock_ptr nor
+ * the pi_state.
+ */
+ WARN_ON(!&q.pi_state);
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+ debug_rt_mutex_free_waiter(&rt_waiter);
+
+ spin_lock(q.lock_ptr);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr2, fshared, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it
+ * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
}
- ret = get_user(uval, uaddr);
- if (!ret)
- goto retry;
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle the
+ * fault, unlock the rt_mutex and return the fault to userspace.
+ */
+ if (ret == -EFAULT) {
+ if (rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+ } else if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart by calling
+ * futex_lock_pi() directly. We could restart this syscall, but
+ * it would detect that the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart and return
+ * -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+out_put_keys:
+ put_futex_key(fshared, &q.key);
+out_key2:
+ put_futex_key(fshared, &key2);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
return ret;
}
@@ -1935,7 +2508,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
fshared = 1;
clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET)
+ if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
switch (cmd) {
@@ -1950,10 +2523,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
ret = futex_wake(uaddr, fshared, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+ 0);
break;
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1970,6 +2544,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (futex_cmpxchg_enabled)
ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
break;
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+ clockrt, uaddr2);
+ break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+ 1);
+ break;
default:
ret = -ENOSYS;
}
@@ -1987,7 +2570,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
@@ -1999,11 +2583,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
/*
- * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
*/
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_WAKE_OP)
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee2..235716556bf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
int cmd = op & FUTEX_CMD_MASK;
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET)) {
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
if (get_compat_timespec(&ts, utime))
return -EFAULT;
if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (int) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 00000000000..22e9dcfaa3d
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
+menu "GCOV-based kernel profiling"
+
+config GCOV_KERNEL
+ bool "Enable gcov-based kernel profiling"
+ depends on DEBUG_FS && CONSTRUCTORS
+ default n
+ ---help---
+ This option enables gcov-based code profiling (e.g. for code coverage
+ measurements).
+
+ If unsure, say N.
+
+ Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
+ for the entire kernel. To enable profiling for specific files or
+ directories, add a line similar to the following to the respective
+ Makefile:
+
+ For a single file (e.g. main.o):
+ GCOV_PROFILE_main.o := y
+
+ For all files in one directory:
+ GCOV_PROFILE := y
+
+ To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
+ is specified, use:
+
+ GCOV_PROFILE_main.o := n
+ and:
+ GCOV_PROFILE := n
+
+ Note that the debugfs filesystem has to be mounted to access
+ profiling data.
+
+config GCOV_PROFILE_ALL
+ bool "Profile entire Kernel"
+ depends on GCOV_KERNEL
+ depends on S390 || X86
+ default n
+ ---help---
+ This options activates profiling for the entire kernel.
+
+ If unsure, say N.
+
+ Note that a kernel compiled with profiling flags will be significantly
+ larger and run slower. Also be sure to exclude files from profiling
+ which are not linked to the kernel image to prevent linker errors.
+
+endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 00000000000..3f761001d51
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+
+obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 00000000000..9b22d03cc58
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
+/*
+ * This code maintains a list of active profiling data structures.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+static struct gcov_info *gcov_info_head;
+static int gcov_events_enabled;
+static DEFINE_MUTEX(gcov_lock);
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = info->version;
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+/**
+ * gcov_enable_events - enable event reporting through gcov_event()
+ *
+ * Turn on reporting of profiling data load/unload-events through the
+ * gcov_event() callback. Also replay all previous events once. This function
+ * is needed because some events are potentially generated too early for the
+ * callback implementation to handle them initially.
+ */
+void gcov_enable_events(void)
+{
+ struct gcov_info *info;
+
+ mutex_lock(&gcov_lock);
+ gcov_events_enabled = 1;
+ /* Perform event callback for previously registered entries. */
+ for (info = gcov_info_head; info; info = info->next)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+
+#ifdef CONFIG_MODULES
+static inline int within(void *addr, void *start, unsigned long size)
+{
+ return ((addr >= start) && (addr < start + size));
+}
+
+/* Update list and generate events when modules are unloaded. */
+static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
+ void *data)
+{
+ struct module *mod = data;
+ struct gcov_info *info;
+ struct gcov_info *prev;
+
+ if (event != MODULE_STATE_GOING)
+ return NOTIFY_OK;
+ mutex_lock(&gcov_lock);
+ prev = NULL;
+ /* Remove entries located in module from linked list. */
+ for (info = gcov_info_head; info; info = info->next) {
+ if (within(info, mod->module_core, mod->core_size)) {
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_REMOVE, info);
+ } else
+ prev = info;
+ }
+ mutex_unlock(&gcov_lock);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block gcov_nb = {
+ .notifier_call = gcov_module_notifier,
+};
+
+static int __init gcov_init(void)
+{
+ return register_module_notifier(&gcov_nb);
+}
+device_initcall(gcov_init);
+#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 00000000000..ef3c3f88a7a
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
+/*
+ * This code exports profiling data as debugfs files to userspace.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ * Yi CDL Yang
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include "gcov.h"
+
+/**
+ * struct gcov_node - represents a debugfs entry
+ * @list: list head for child node list
+ * @children: child nodes
+ * @all: list head for list of all nodes
+ * @parent: parent node
+ * @info: associated profiling data structure if not a directory
+ * @ghost: when an object file containing profiling data is unloaded we keep a
+ * copy of the profiling data here to allow collecting coverage data
+ * for cleanup code. Such a node is called a "ghost".
+ * @dentry: main debugfs entry, either a directory or data file
+ * @links: associated symbolic links
+ * @name: data file basename
+ *
+ * struct gcov_node represents an entity within the gcov/ subdirectory
+ * of debugfs. There are directory and data file nodes. The latter represent
+ * the actual synthesized data file plus any associated symbolic links which
+ * are needed by the gcov tool to work correctly.
+ */
+struct gcov_node {
+ struct list_head list;
+ struct list_head children;
+ struct list_head all;
+ struct gcov_node *parent;
+ struct gcov_info *info;
+ struct gcov_info *ghost;
+ struct dentry *dentry;
+ struct dentry **links;
+ char name[0];
+};
+
+static const char objtree[] = OBJTREE;
+static const char srctree[] = SRCTREE;
+static struct gcov_node root_node;
+static struct dentry *reset_dentry;
+static LIST_HEAD(all_head);
+static DEFINE_MUTEX(node_lock);
+
+/* If non-zero, keep copies of profiling data for unloaded modules. */
+static int gcov_persist = 1;
+
+static int __init gcov_persist_setup(char *str)
+{
+ unsigned long val;
+
+ if (strict_strtoul(str, 0, &val)) {
+ pr_warning("invalid gcov_persist parameter '%s'\n", str);
+ return 0;
+ }
+ gcov_persist = val;
+ pr_info("setting gcov_persist to %d\n", gcov_persist);
+
+ return 1;
+}
+__setup("gcov_persist=", gcov_persist_setup);
+
+/*
+ * seq_file.start() implementation for gcov data files. Note that the
+ * gcov_iterator interface is designed to be more restrictive than seq_file
+ * (no start from arbitrary position, etc.), to simplify the iterator
+ * implementation.
+ */
+static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t i;
+
+ gcov_iter_start(seq->private);
+ for (i = 0; i < *pos; i++) {
+ if (gcov_iter_next(seq->private))
+ return NULL;
+ }
+ return seq->private;
+}
+
+/* seq_file.next() implementation for gcov data files. */
+static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_next(iter))
+ return NULL;
+ (*pos)++;
+
+ return iter;
+}
+
+/* seq_file.show() implementation for gcov data files. */
+static int gcov_seq_show(struct seq_file *seq, void *data)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_write(iter, seq))
+ return -EINVAL;
+ return 0;
+}
+
+static void gcov_seq_stop(struct seq_file *seq, void *data)
+{
+ /* Unused. */
+}
+
+static const struct seq_operations gcov_seq_ops = {
+ .start = gcov_seq_start,
+ .next = gcov_seq_next,
+ .show = gcov_seq_show,
+ .stop = gcov_seq_stop,
+};
+
+/*
+ * Return the profiling data set for a given node. This can either be the
+ * original profiling data structure or a duplicate (also called "ghost")
+ * in case the associated object file has been unloaded.
+ */
+static struct gcov_info *get_node_info(struct gcov_node *node)
+{
+ if (node->info)
+ return node->info;
+
+ return node->ghost;
+}
+
+/*
+ * open() implementation for gcov data files. Create a copy of the profiling
+ * data set and initialize the iterator and seq_file interface.
+ */
+static int gcov_seq_open(struct inode *inode, struct file *file)
+{
+ struct gcov_node *node = inode->i_private;
+ struct gcov_iterator *iter;
+ struct seq_file *seq;
+ struct gcov_info *info;
+ int rc = -ENOMEM;
+
+ mutex_lock(&node_lock);
+ /*
+ * Read from a profiling data copy to minimize reference tracking
+ * complexity and concurrent access.
+ */
+ info = gcov_info_dup(get_node_info(node));
+ if (!info)
+ goto out_unlock;
+ iter = gcov_iter_new(info);
+ if (!iter)
+ goto err_free_info;
+ rc = seq_open(file, &gcov_seq_ops);
+ if (rc)
+ goto err_free_iter_info;
+ seq = file->private_data;
+ seq->private = iter;
+out_unlock:
+ mutex_unlock(&node_lock);
+ return rc;
+
+err_free_iter_info:
+ gcov_iter_free(iter);
+err_free_info:
+ gcov_info_free(info);
+ goto out_unlock;
+}
+
+/*
+ * release() implementation for gcov data files. Release resources allocated
+ * by open().
+ */
+static int gcov_seq_release(struct inode *inode, struct file *file)
+{
+ struct gcov_iterator *iter;
+ struct gcov_info *info;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ iter = seq->private;
+ info = gcov_iter_get_info(iter);
+ gcov_iter_free(iter);
+ gcov_info_free(info);
+ seq_release(inode, file);
+
+ return 0;
+}
+
+/*
+ * Find a node by the associated data file name. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *get_node_by_name(const char *name)
+{
+ struct gcov_node *node;
+ struct gcov_info *info;
+
+ list_for_each_entry(node, &all_head, all) {
+ info = get_node_info(node);
+ if (info && (strcmp(info->filename, name) == 0))
+ return node;
+ }
+
+ return NULL;
+}
+
+static void remove_node(struct gcov_node *node);
+
+/*
+ * write() implementation for gcov data files. Reset profiling data for the
+ * associated file. If the object file has been unloaded (i.e. this is
+ * a "ghost" node), remove the debug fs node as well.
+ */
+static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct seq_file *seq;
+ struct gcov_info *info;
+ struct gcov_node *node;
+
+ seq = file->private_data;
+ info = gcov_iter_get_info(seq->private);
+ mutex_lock(&node_lock);
+ node = get_node_by_name(info->filename);
+ if (node) {
+ /* Reset counts or remove node for unloaded modules. */
+ if (node->ghost)
+ remove_node(node);
+ else
+ gcov_info_reset(node->info);
+ }
+ /* Reset counts for open file. */
+ gcov_info_reset(info);
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/*
+ * Given a string <path> representing a file path of format:
+ * path/to/file.gcda
+ * construct and return a new string:
+ * <dir/>path/to/file.<ext>
+ */
+static char *link_target(const char *dir, const char *path, const char *ext)
+{
+ char *target;
+ char *old_ext;
+ char *copy;
+
+ copy = kstrdup(path, GFP_KERNEL);
+ if (!copy)
+ return NULL;
+ old_ext = strrchr(copy, '.');
+ if (old_ext)
+ *old_ext = '\0';
+ if (dir)
+ target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
+ else
+ target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
+ kfree(copy);
+
+ return target;
+}
+
+/*
+ * Construct a string representing the symbolic link target for the given
+ * gcov data file name and link type. Depending on the link type and the
+ * location of the data file, the link target can either point to a
+ * subdirectory of srctree, objtree or in an external location.
+ */
+static char *get_link_target(const char *filename, const struct gcov_link *ext)
+{
+ const char *rel;
+ char *result;
+
+ if (strncmp(filename, objtree, strlen(objtree)) == 0) {
+ rel = filename + strlen(objtree) + 1;
+ if (ext->dir == SRC_TREE)
+ result = link_target(srctree, rel, ext->ext);
+ else
+ result = link_target(objtree, rel, ext->ext);
+ } else {
+ /* External compilation. */
+ result = link_target(NULL, filename, ext->ext);
+ }
+
+ return result;
+}
+
+#define SKEW_PREFIX ".tmp_"
+
+/*
+ * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
+ * for filename skewing caused by the mod-versioning mechanism.
+ */
+static const char *deskew(const char *basename)
+{
+ if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
+ return basename + sizeof(SKEW_PREFIX) - 1;
+ return basename;
+}
+
+/*
+ * Create links to additional files (usually .c and .gcno files) which the
+ * gcov tool expects to find in the same directory as the gcov data file.
+ */
+static void add_links(struct gcov_node *node, struct dentry *parent)
+{
+ char *basename;
+ char *target;
+ int num;
+ int i;
+
+ for (num = 0; gcov_link[num].ext; num++)
+ /* Nothing. */;
+ node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
+ if (!node->links)
+ return;
+ for (i = 0; i < num; i++) {
+ target = get_link_target(get_node_info(node)->filename,
+ &gcov_link[i]);
+ if (!target)
+ goto out_err;
+ basename = strrchr(target, '/');
+ if (!basename)
+ goto out_err;
+ basename++;
+ node->links[i] = debugfs_create_symlink(deskew(basename),
+ parent, target);
+ if (!node->links[i])
+ goto out_err;
+ kfree(target);
+ }
+
+ return;
+out_err:
+ kfree(target);
+ while (i-- > 0)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+static const struct file_operations gcov_data_fops = {
+ .open = gcov_seq_open,
+ .release = gcov_seq_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = gcov_seq_write,
+};
+
+/* Basic initialization of a new node. */
+static void init_node(struct gcov_node *node, struct gcov_info *info,
+ const char *name, struct gcov_node *parent)
+{
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->children);
+ INIT_LIST_HEAD(&node->all);
+ node->info = info;
+ node->parent = parent;
+ if (name)
+ strcpy(node->name, name);
+}
+
+/*
+ * Create a new node and associated debugfs entry. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *new_node(struct gcov_node *parent,
+ struct gcov_info *info, const char *name)
+{
+ struct gcov_node *node;
+
+ node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
+ if (!node) {
+ pr_warning("out of memory\n");
+ return NULL;
+ }
+ init_node(node, info, name, parent);
+ /* Differentiate between gcov data file nodes and directory nodes. */
+ if (info) {
+ node->dentry = debugfs_create_file(deskew(node->name), 0600,
+ parent->dentry, node, &gcov_data_fops);
+ } else
+ node->dentry = debugfs_create_dir(node->name, parent->dentry);
+ if (!node->dentry) {
+ pr_warning("could not create file\n");
+ kfree(node);
+ return NULL;
+ }
+ if (info)
+ add_links(node, parent->dentry);
+ list_add(&node->list, &parent->children);
+ list_add(&node->all, &all_head);
+
+ return node;
+}
+
+/* Remove symbolic links associated with node. */
+static void remove_links(struct gcov_node *node)
+{
+ int i;
+
+ if (!node->links)
+ return;
+ for (i = 0; gcov_link[i].ext; i++)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+/*
+ * Remove node from all lists and debugfs and release associated resources.
+ * Needs to be called with node_lock held.
+ */
+static void release_node(struct gcov_node *node)
+{
+ list_del(&node->list);
+ list_del(&node->all);
+ debugfs_remove(node->dentry);
+ remove_links(node);
+ if (node->ghost)
+ gcov_info_free(node->ghost);
+ kfree(node);
+}
+
+/* Release node and empty parents. Needs to be called with node_lock held. */
+static void remove_node(struct gcov_node *node)
+{
+ struct gcov_node *parent;
+
+ while ((node != &root_node) && list_empty(&node->children)) {
+ parent = node->parent;
+ release_node(node);
+ node = parent;
+ }
+}
+
+/*
+ * Find child node with given basename. Needs to be called with node_lock
+ * held.
+ */
+static struct gcov_node *get_child_by_name(struct gcov_node *parent,
+ const char *name)
+{
+ struct gcov_node *node;
+
+ list_for_each_entry(node, &parent->children, list) {
+ if (strcmp(node->name, name) == 0)
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * write() implementation for reset file. Reset all profiling data to zero
+ * and remove ghost nodes.
+ */
+static ssize_t reset_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+restart:
+ list_for_each_entry(node, &all_head, all) {
+ if (node->info)
+ gcov_info_reset(node->info);
+ else if (list_empty(&node->children)) {
+ remove_node(node);
+ /* Several nodes may have gone - restart loop. */
+ goto restart;
+ }
+ }
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/* read() implementation for reset file. Unused. */
+static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
+ loff_t *pos)
+{
+ /* Allow read operation so that a recursive copy won't fail. */
+ return 0;
+}
+
+static const struct file_operations gcov_reset_fops = {
+ .write = reset_write,
+ .read = reset_read,
+};
+
+/*
+ * Create a node for a given profiling data set and add it to all lists and
+ * debugfs. Needs to be called with node_lock held.
+ */
+static void add_node(struct gcov_info *info)
+{
+ char *filename;
+ char *curr;
+ char *next;
+ struct gcov_node *parent;
+ struct gcov_node *node;
+
+ filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!filename)
+ return;
+ parent = &root_node;
+ /* Create directory nodes along the path. */
+ for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
+ if (curr == next)
+ continue;
+ *next = 0;
+ if (strcmp(curr, ".") == 0)
+ continue;
+ if (strcmp(curr, "..") == 0) {
+ if (!parent->parent)
+ goto err_remove;
+ parent = parent->parent;
+ continue;
+ }
+ node = get_child_by_name(parent, curr);
+ if (!node) {
+ node = new_node(parent, NULL, curr);
+ if (!node)
+ goto err_remove;
+ }
+ parent = node;
+ }
+ /* Create file node. */
+ node = new_node(parent, info, curr);
+ if (!node)
+ goto err_remove;
+out:
+ kfree(filename);
+ return;
+
+err_remove:
+ remove_node(parent);
+ goto out;
+}
+
+/*
+ * The profiling data set associated with this node is being unloaded. Store a
+ * copy of the profiling data and turn this node into a "ghost".
+ */
+static int ghost_node(struct gcov_node *node)
+{
+ node->ghost = gcov_info_dup(node->info);
+ if (!node->ghost) {
+ pr_warning("could not save data for '%s' (out of memory)\n",
+ node->info->filename);
+ return -ENOMEM;
+ }
+ node->info = NULL;
+
+ return 0;
+}
+
+/*
+ * Profiling data for this node has been loaded again. Add profiling data
+ * from previous instantiation and turn this node into a regular node.
+ */
+static void revive_node(struct gcov_node *node, struct gcov_info *info)
+{
+ if (gcov_info_is_compatible(node->ghost, info))
+ gcov_info_add(info, node->ghost);
+ else {
+ pr_warning("discarding saved data for '%s' (version changed)\n",
+ info->filename);
+ }
+ gcov_info_free(node->ghost);
+ node->ghost = NULL;
+ node->info = info;
+}
+
+/*
+ * Callback to create/remove profiling files when code compiled with
+ * -fprofile-arcs is loaded/unloaded.
+ */
+void gcov_event(enum gcov_action action, struct gcov_info *info)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+ node = get_node_by_name(info->filename);
+ switch (action) {
+ case GCOV_ADD:
+ /* Add new node or revive ghost. */
+ if (!node) {
+ add_node(info);
+ break;
+ }
+ if (gcov_persist)
+ revive_node(node, info);
+ else {
+ pr_warning("could not add '%s' (already exists)\n",
+ info->filename);
+ }
+ break;
+ case GCOV_REMOVE:
+ /* Remove node or turn into ghost. */
+ if (!node) {
+ pr_warning("could not remove '%s' (not found)\n",
+ info->filename);
+ break;
+ }
+ if (gcov_persist) {
+ if (!ghost_node(node))
+ break;
+ }
+ remove_node(node);
+ break;
+ }
+ mutex_unlock(&node_lock);
+}
+
+/* Create debugfs entries. */
+static __init int gcov_fs_init(void)
+{
+ int rc = -EIO;
+
+ init_node(&root_node, NULL, NULL, NULL);
+ /*
+ * /sys/kernel/debug/gcov will be parent for the reset control file
+ * and all profiling files.
+ */
+ root_node.dentry = debugfs_create_dir("gcov", NULL);
+ if (!root_node.dentry)
+ goto err_remove;
+ /*
+ * Create reset file which resets all profiling counts when written
+ * to.
+ */
+ reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
+ NULL, &gcov_reset_fops);
+ if (!reset_dentry)
+ goto err_remove;
+ /* Replay previous events to get our fs hierarchy up-to-date. */
+ gcov_enable_events();
+ return 0;
+
+err_remove:
+ pr_err("init failed\n");
+ if (root_node.dentry)
+ debugfs_remove(root_node.dentry);
+
+ return rc;
+}
+device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 00000000000..ae5bb426003
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 3.4. Future versions of gcc may change the gcov
+ * format (as happened before), so all format-specific information needs
+ * to be kept modular and easily exchangeable.
+ *
+ * This file is based on gcc-internal definitions. Functions and data
+ * structures are defined to be compatible with gcc counterparts.
+ * For a better understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Based on gcc magic. Doesn't change
+ * at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return (1 << type) & info->ctr_mask;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active; i++) {
+ memset(info->counts[i].values, 0,
+ info->counts[i].num * sizeof(gcov_type));
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
+{
+ unsigned int i;
+ unsigned int j;
+
+ for (i = 0; i < num_counter_active(dest); i++) {
+ for (j = 0; j < dest->counts[i].num; j++) {
+ dest->counts[i].values[j] +=
+ source->counts[i].values[j];
+ }
+ }
+}
+
+/* Get size of function info entry. Based on gcc magic. */
+static size_t get_fn_size(struct gcov_info *info)
+{
+ size_t size;
+
+ size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
+ sizeof(unsigned int);
+ if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
+ size = ALIGN(size, __alignof__(struct gcov_fn_info));
+ return size;
+}
+
+/* Get address of function info entry. Based on gcc magic. */
+static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
+{
+ return (struct gcov_fn_info *)
+ ((char *) info->functions + fn * get_fn_size(info));
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ unsigned int i;
+ unsigned int active;
+
+ /* Duplicate gcov_info. */
+ active = num_counter_active(info);
+ dup = kzalloc(sizeof(struct gcov_info) +
+ sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ dup->version = info->version;
+ dup->stamp = info->stamp;
+ dup->n_functions = info->n_functions;
+ dup->ctr_mask = info->ctr_mask;
+ /* Duplicate filename. */
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+ /* Duplicate table of functions. */
+ dup->functions = kmemdup(info->functions, info->n_functions *
+ get_fn_size(info), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+ /* Duplicate counter arrays. */
+ for (i = 0; i < active ; i++) {
+ struct gcov_ctr_info *ctr = &info->counts[i];
+ size_t size = ctr->num * sizeof(gcov_type);
+
+ dup->counts[i].num = ctr->num;
+ dup->counts[i].merge = ctr->merge;
+ dup->counts[i].values = vmalloc(size);
+ if (!dup->counts[i].values)
+ goto err_free;
+ memcpy(dup->counts[i].values, ctr->values, size);
+ }
+ return dup;
+
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active ; i++)
+ vfree(info->counts[i].values);
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+/**
+ * struct type_info - iterator helper array
+ * @ctr_type: counter type
+ * @offset: index of the first value of the current function for this type
+ *
+ * This array is needed to convert the in-memory data format into the in-file
+ * data format:
+ *
+ * In-memory:
+ * for each counter type
+ * for each function
+ * values
+ *
+ * In-file:
+ * for each function
+ * for each counter type
+ * values
+ *
+ * See gcc source gcc/gcov-io.h for more information on data organization.
+ */
+struct type_info {
+ int ctr_type;
+ unsigned int offset;
+};
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @record: record type
+ * @function: function number
+ * @type: counter type
+ * @count: index into values array
+ * @num_types: number of counter types
+ * @type_info: helper array to get values-array offset for current function
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+
+ int record;
+ unsigned int function;
+ unsigned int type;
+ unsigned int count;
+
+ int num_types;
+ struct type_info type_info[0];
+};
+
+static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
+{
+ return get_fn_info(iter->info, iter->function);
+}
+
+static struct type_info *get_type(struct gcov_iterator *iter)
+{
+ return &iter->type_info[iter->type];
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator) +
+ num_counter_active(info) * sizeof(struct type_info),
+ GFP_KERNEL);
+ if (iter)
+ iter->info = info;
+
+ return iter;
+}
+
+/**
+ * gcov_iter_free - release memory for iterator
+ * @iter: file iterator to free
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ int i;
+
+ iter->record = 0;
+ iter->function = 0;
+ iter->type = 0;
+ iter->count = 0;
+ iter->num_types = 0;
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(iter->info, i)) {
+ iter->type_info[iter->num_types].ctr_type = i;
+ iter->type_info[iter->num_types++].offset = 0;
+ }
+ }
+}
+
+/* Mapping of logical record number to actual file content. */
+#define RECORD_FILE_MAGIC 0
+#define RECORD_GCOV_VERSION 1
+#define RECORD_TIME_STAMP 2
+#define RECORD_FUNCTION_TAG 3
+#define RECORD_FUNCTON_TAG_LEN 4
+#define RECORD_FUNCTION_IDENT 5
+#define RECORD_FUNCTION_CHECK 6
+#define RECORD_COUNT_TAG 7
+#define RECORD_COUNT_LEN 8
+#define RECORD_COUNT 9
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ case RECORD_GCOV_VERSION:
+ case RECORD_FUNCTION_TAG:
+ case RECORD_FUNCTON_TAG_LEN:
+ case RECORD_FUNCTION_IDENT:
+ case RECORD_COUNT_TAG:
+ /* Advance to next record */
+ iter->record++;
+ break;
+ case RECORD_COUNT:
+ /* Advance to next count */
+ iter->count++;
+ /* fall through */
+ case RECORD_COUNT_LEN:
+ if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
+ iter->record = 9;
+ break;
+ }
+ /* Advance to next counter type */
+ get_type(iter)->offset += iter->count;
+ iter->count = 0;
+ iter->type++;
+ /* fall through */
+ case RECORD_FUNCTION_CHECK:
+ if (iter->type < iter->num_types) {
+ iter->record = 7;
+ break;
+ }
+ /* Advance to next function */
+ iter->type = 0;
+ iter->function++;
+ /* fall through */
+ case RECORD_TIME_STAMP:
+ if (iter->function < iter->info->n_functions)
+ iter->record = 3;
+ else
+ iter->record = -1;
+ break;
+ }
+ /* Check for EOF. */
+ if (iter->record == -1)
+ return -EINVAL;
+ else
+ return 0;
+}
+
+/**
+ * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file.
+ */
+static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
+{
+ return seq_write(seq, &v, sizeof(v));
+}
+
+/**
+ * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first.
+ */
+static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
+{
+ u32 data[2];
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ return seq_write(seq, data, sizeof(data));
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ int rc = -EINVAL;
+
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
+ break;
+ case RECORD_GCOV_VERSION:
+ rc = seq_write_gcov_u32(seq, iter->info->version);
+ break;
+ case RECORD_TIME_STAMP:
+ rc = seq_write_gcov_u32(seq, iter->info->stamp);
+ break;
+ case RECORD_FUNCTION_TAG:
+ rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
+ break;
+ case RECORD_FUNCTON_TAG_LEN:
+ rc = seq_write_gcov_u32(seq, 2);
+ break;
+ case RECORD_FUNCTION_IDENT:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
+ break;
+ case RECORD_FUNCTION_CHECK:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
+ break;
+ case RECORD_COUNT_TAG:
+ rc = seq_write_gcov_u32(seq,
+ GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
+ break;
+ case RECORD_COUNT_LEN:
+ rc = seq_write_gcov_u32(seq,
+ get_func(iter)->n_ctrs[iter->type] * 2);
+ break;
+ case RECORD_COUNT:
+ rc = seq_write_gcov_u64(seq,
+ iter->info->counts[iter->type].
+ values[iter->count + get_type(iter)->offset]);
+ break;
+ }
+ return rc;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 00000000000..060073ebf7a
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
+/*
+ * Profiling infrastructure declarations.
+ *
+ * This file is based on gcc-internal definitions. Data structures are
+ * defined to be compatible with gcc counterparts. For a better
+ * understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#ifndef GCOV_H
+#define GCOV_H GCOV_H
+
+#include <linux/types.h>
+
+/*
+ * Profiling data types used for gcc 3.4 and above - these are defined by
+ * gcc and need to be kept as close to the original definition as possible to
+ * remain compatible.
+ */
+#define GCOV_COUNTERS 5
+#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
+#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
+#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
+#define GCOV_TAG_FOR_COUNTER(count) \
+ (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
+
+#if BITS_PER_LONG >= 64
+typedef long gcov_type;
+#else
+typedef long long gcov_type;
+#endif
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/* Base interface. */
+enum gcov_action {
+ GCOV_ADD,
+ GCOV_REMOVE,
+};
+
+void gcov_event(enum gcov_action action, struct gcov_info *info);
+void gcov_enable_events(void);
+
+/* Iterator control. */
+struct seq_file;
+struct gcov_iterator;
+
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
+void gcov_iter_free(struct gcov_iterator *iter);
+void gcov_iter_start(struct gcov_iterator *iter);
+int gcov_iter_next(struct gcov_iterator *iter);
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+
+/* gcov_info control. */
+void gcov_info_reset(struct gcov_info *info);
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
+struct gcov_info *gcov_info_dup(struct gcov_info *info);
+void gcov_info_free(struct gcov_info *info);
+
+struct gcov_link {
+ enum {
+ OBJ_TREE,
+ SRC_TREE,
+ } dir;
+ const char *ext;
+};
+extern const struct gcov_link gcov_link[];
+
+#endif /* GCOV_H */
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 00000000000..2b45b2ee396
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
+/*
+ * Supplementary group IDs
+ */
+#include <linux/cred.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <asm/uaccess.h>
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+ struct group_info *group_info;
+ int nblocks;
+ int i;
+
+ nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+ /* Make sure we always allocate at least one indirect block pointer */
+ nblocks = nblocks ? : 1;
+ group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+ if (!group_info)
+ return NULL;
+ group_info->ngroups = gidsetsize;
+ group_info->nblocks = nblocks;
+ atomic_set(&group_info->usage, 1);
+
+ if (gidsetsize <= NGROUPS_SMALL)
+ group_info->blocks[0] = group_info->small_block;
+ else {
+ for (i = 0; i < nblocks; i++) {
+ gid_t *b;
+ b = (void *)__get_free_page(GFP_USER);
+ if (!b)
+ goto out_undo_partial_alloc;
+ group_info->blocks[i] = b;
+ }
+ }
+ return group_info;
+
+out_undo_partial_alloc:
+ while (--i >= 0) {
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+ return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+ if (group_info->blocks[0] != group_info->small_block) {
+ int i;
+ for (i = 0; i < group_info->nblocks; i++)
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+ const struct group_info *group_info)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_to_user(grouplist, group_info->blocks[i], len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+ gid_t __user *grouplist)
+{
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < group_info->nblocks; i++) {
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
+
+ if (copy_from_user(group_info->blocks[i], grouplist, len))
+ return -EFAULT;
+
+ grouplist += NGROUPS_PER_BLOCK;
+ count -= cp_count;
+ }
+ return 0;
+}
+
+/* a simple Shell sort */
+static void groups_sort(struct group_info *group_info)
+{
+ int base, max, stride;
+ int gidsetsize = group_info->ngroups;
+
+ for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+ ; /* nothing */
+ stride /= 3;
+
+ while (stride) {
+ max = gidsetsize - stride;
+ for (base = 0; base < max; base++) {
+ int left = base;
+ int right = left + stride;
+ gid_t tmp = GROUP_AT(group_info, right);
+
+ while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
+ GROUP_AT(group_info, right) =
+ GROUP_AT(group_info, left);
+ right = left;
+ left -= stride;
+ }
+ GROUP_AT(group_info, right) = tmp;
+ }
+ stride /= 3;
+ }
+}
+
+/* a simple bsearch */
+int groups_search(const struct group_info *group_info, gid_t grp)
+{
+ unsigned int left, right;
+
+ if (!group_info)
+ return 0;
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ unsigned int mid = (left+right)/2;
+ int cmp = grp - GROUP_AT(group_info, mid);
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ *
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
+ */
+int set_groups(struct cred *new, struct group_info *group_info)
+{
+ int retval;
+
+ retval = security_task_setgroups(group_info);
+ if (retval)
+ return retval;
+
+ put_group_info(new->group_info);
+ groups_sort(group_info);
+ get_group_info(group_info);
+ new->group_info = group_info;
+ return 0;
+}
+
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+ struct cred *new;
+ int ret;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ ret = set_groups(new, group_info);
+ if (ret < 0) {
+ abort_creds(new);
+ return ret;
+ }
+
+ return commit_creds(new);
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ const struct cred *cred = current_cred();
+ int i;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ /* no need to grab task_lock here; it cannot change */
+ i = cred->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups_to_user(grouplist, cred->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ return i;
+}
+
+/*
+ * SMP: Our groups are copy-on-write. We can set them safely
+ * without another task interfering.
+ */
+
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!capable(CAP_SETGID))
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(gid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (grp != cred->fsgid)
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(gid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (grp != cred->egid)
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2dc30c59c5f..05071bf6a37 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -189,21 +191,65 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
}
+
+/*
+ * Get the preferred target CPU for NOHZ
+ */
+static int hrtimer_get_target(int this_cpu, int pinned)
+{
+#ifdef CONFIG_NO_HZ
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+ int preferred_cpu = get_nohz_load_balancer();
+
+ if (preferred_cpu >= 0)
+ return preferred_cpu;
+ }
+#endif
+ return this_cpu;
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ ktime_t expires;
+
+ if (!new_base->cpu_base->hres_active)
+ return 0;
+
+ expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+ return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+ return 0;
+#endif
+}
+
/*
* Switch the timer base to the current CPU when possible.
*/
static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ int pinned)
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int this_cpu = smp_processor_id();
+ int cpu = hrtimer_get_target(this_cpu, pinned);
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
/*
- * We are trying to schedule the timer on the local CPU.
+ * We are trying to move timer to new_base.
* However we can't change timer's base while it is running,
* so we keep it on the same CPU. No hassle vs. reprogramming
* the event source in the high resolution case. The softirq
@@ -218,6 +264,14 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
timer->base = new_base;
}
return new_base;
@@ -235,7 +289,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
return base;
}
-# define switch_hrtimer_base(t, b) (b)
+# define switch_hrtimer_base(t, b, p) (b)
#endif /* !CONFIG_SMP */
@@ -332,6 +386,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
return res;
}
+EXPORT_SYMBOL_GPL(ktime_add_safe);
+
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static struct debug_obj_descr hrtimer_debug_descr;
@@ -429,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
debug_object_init_on_stack(timer, &hrtimer_debug_descr);
__hrtimer_init(timer, clock_id, mode);
}
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
@@ -501,6 +558,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ /*
+ * clock_was_set() has changed base->offset so the
+ * result might be negative. Fix it up to prevent a
+ * false positive in clockevents_program_event()
+ */
+ if (expires.tv64 < 0)
+ expires.tv64 = 0;
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
@@ -614,7 +678,9 @@ void clock_was_set(void)
*/
void hres_timers_resume(void)
{
- /* Retrigger the CPU local events: */
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hres_timers_resume() called with IRQs enabled!");
+
retrigger_next_event(NULL);
}
@@ -642,14 +708,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- spin_lock(&base->cpu_base->lock);
+ if (wakeup) {
+ spin_unlock(&base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ spin_lock(&base->cpu_base->lock);
+ } else
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+
return 1;
}
+
return 0;
}
@@ -694,7 +766,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ int wakeup)
{
return 0;
}
@@ -877,20 +950,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
return 0;
}
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int
-hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
- const enum hrtimer_mode mode)
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode,
+ int wakeup)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -902,9 +964,9 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
ret = remove_hrtimer(timer, base);
/* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base);
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- if (mode == HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, new_base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -931,12 +993,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
* XXX send_remote_softirq() ?
*/
if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base);
+ hrtimer_enqueue_reprogram(timer, new_base, wakeup);
unlock_hrtimer_base(timer, &flags);
return ret;
}
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+ return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
@@ -952,7 +1031,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
int
hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
- return hrtimer_start_range_ns(timer, tim, 0, mode);
+ return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -1156,6 +1235,29 @@ static void __run_hrtimer(struct hrtimer *timer)
#ifdef CONFIG_HIGH_RES_TIMERS
+static int force_clock_reprogram;
+
+/*
+ * After 5 iteration's attempts, we consider that hrtimer_interrupt()
+ * is hanging, which could happen with something that slows the interrupt
+ * such as the tracing. Then we force the clock reprogramming for each future
+ * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
+ * threshold that we will overwrite.
+ * The next tick event will be scheduled to 3 times we currently spend on
+ * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
+ * 1/4 of their time to process the hrtimer interrupts. This is enough to
+ * let it running without serious starvation.
+ */
+
+static inline void
+hrtimer_interrupt_hanging(struct clock_event_device *dev,
+ ktime_t try_time)
+{
+ force_clock_reprogram = 1;
+ dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+ printk(KERN_WARNING "hrtimer: interrupt too slow, "
+ "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+}
/*
* High resolution timer interrupt
* Called with interrupts disabled
@@ -1165,6 +1267,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
ktime_t expires_next, now;
+ int nr_retries = 0;
int i;
BUG_ON(!cpu_base->hres_active);
@@ -1172,18 +1275,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
dev->next_event.tv64 = KTIME_MAX;
retry:
+ /* 5 retries is enough to notice a hang */
+ if (!(++nr_retries % 5))
+ hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
+
now = ktime_get();
expires_next.tv64 = KTIME_MAX;
+ spin_lock(&cpu_base->lock);
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
base = cpu_base->clock_base;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
ktime_t basenow;
struct rb_node *node;
- spin_lock(&cpu_base->lock);
-
basenow = ktime_add(now, base->offset);
while ((node = base->first)) {
@@ -1216,15 +1331,19 @@ void hrtimer_interrupt(struct clock_event_device *dev)
__run_hrtimer(timer);
}
- spin_unlock(&cpu_base->lock);
base++;
}
+ /*
+ * Store the new expiry value so the migration code can verify
+ * against it.
+ */
cpu_base->expires_next = expires_next;
+ spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
if (expires_next.tv64 != KTIME_MAX) {
- if (tick_program_event(expires_next, 0))
+ if (tick_program_event(expires_next, force_clock_reprogram))
goto retry;
}
}
@@ -1359,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
sl->timer.function = hrtimer_wakeup;
sl->task = task;
}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
@@ -1578,6 +1698,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
{
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 00000000000..022a4927b78
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * The number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+ sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, when a freshly created task is scheduled once, changes
+ * its state to TASK_UNINTERRUPTIBLE without having ever been
+ * switched out once, it musn't be checked.
+ */
+ if (unlikely(t->flags & PF_FROZEN || !switch_count))
+ return;
+
+ if (switch_count != t->last_switch_count) {
+ t->last_switch_count = switch_count;
+ return;
+ }
+ if (!sysctl_hung_task_warnings)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid, timeout);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ touch_nmi_watchdog();
+
+ if (sysctl_hung_task_panic)
+ panic("hung_task: blocked tasks");
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+ get_task_struct(g);
+ get_task_struct(t);
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ put_task_struct(t);
+ put_task_struct(g);
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+ int max_count = sysctl_hung_task_check_count;
+ int batch_count = HUNG_TASK_BATCHING;
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ rcu_read_lock();
+ do_each_thread(g, t) {
+ if (!--max_count)
+ goto unlock;
+ if (!--batch_count) {
+ batch_count = HUNG_TASK_BATCHING;
+ rcu_lock_break(g, t);
+ /* Exit if t or g was unhashed during refresh. */
+ if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+ goto unlock;
+ }
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, timeout);
+ } while_each_thread(g, t);
+ unlock:
+ rcu_read_unlock();
+}
+
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+ /* timeout of 0 will disable the watchdog */
+ return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto out;
+
+ wake_up_process(watchdog_task);
+
+ out:
+ return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+
+ for ( ; ; ) {
+ unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+ while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+ timeout = sysctl_hung_task_timeout_secs;
+
+ check_hung_uninterruptible_tasks(timeout);
+ }
+
+ return 0;
+}
+
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1edac9..7d047808419 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,4 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e..c1660194d11 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
- cpumask_setall(&desc->affinity);
+ cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_clear(desc->pending_mask);
+#endif
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -78,6 +81,7 @@ void dynamic_irq_cleanup(unsigned int irq)
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
desc->name = NULL;
+ clear_kstat_irqs(desc);
spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -218,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
}
EXPORT_SYMBOL(set_irq_chip_data);
+/**
+ * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
+ *
+ * @irq: Interrupt number
+ * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
+ *
+ * The IRQ_NESTED_THREAD flag indicates that on
+ * request_threaded_irq() no separate interrupt thread should be
+ * created for the irq as the handler are called nested in the
+ * context of a demultiplexing interrupt handler thread.
+ */
+void set_irq_nested_thread(unsigned int irq, int nest)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ if (!desc)
+ return;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ if (nest)
+ desc->status |= IRQ_NESTED_THREAD;
+ else
+ desc->status &= ~IRQ_NESTED_THREAD;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(set_irq_nested_thread);
+
/*
* default enable function
*/
@@ -290,10 +322,50 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
desc->chip->mask_ack(irq);
else {
desc->chip->mask(irq);
- desc->chip->ack(irq);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
}
}
+/*
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
+ *
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling
+ * threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ might_sleep();
+
+ spin_lock_irq(&desc->lock);
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ action = desc->action;
+ if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ goto out_unlock;
+
+ desc->status |= IRQ_INPROGRESS;
+ spin_unlock_irq(&desc->lock);
+
+ action_ret = action->thread_fn(action->irq, action->dev_id);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ spin_lock_irq(&desc->lock);
+ desc->status &= ~IRQ_INPROGRESS;
+
+out_unlock:
+ spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -354,7 +426,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
spin_lock(&desc->lock);
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
if (unlikely(desc->status & IRQ_INPROGRESS))
goto out_unlock;
@@ -378,11 +449,15 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
- if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+
+ if (unlikely(desc->status & IRQ_ONESHOT))
+ desc->status |= IRQ_MASKED;
+ else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
desc->chip->unmask(irq);
out_unlock:
spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_level_irq);
/**
* handle_fasteoi_irq - irq handler for transparent controllers
@@ -432,7 +507,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
desc->status &= ~IRQ_INPROGRESS;
out:
desc->chip->eoi(irq);
- desc = irq_remap_to_desc(irq, desc);
spin_unlock(&desc->lock);
}
@@ -469,14 +543,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
!desc->action)) {
desc->status |= (IRQ_PENDING | IRQ_MASKED);
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
goto out_unlock;
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
- desc->chip->ack(irq);
- desc = irq_remap_to_desc(irq, desc);
+ if (desc->chip->ack)
+ desc->chip->ack(irq);
/* Mark the IRQ currently in progress.*/
desc->status |= IRQ_INPROGRESS;
@@ -537,10 +610,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
if (!noirqdebug)
note_interrupt(irq, desc, action_ret);
- if (desc->chip->eoi) {
+ if (desc->chip->eoi)
desc->chip->eoi(irq);
- desc = irq_remap_to_desc(irq, desc);
- }
}
void
@@ -571,14 +642,13 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip = &dummy_irq_chip;
}
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
/* Uninstall? */
if (handle == handle_bad_irq) {
- if (desc->chip != &no_irq_chip) {
+ if (desc->chip != &no_irq_chip)
mask_ack_irq(desc, irq);
- desc = irq_remap_to_desc(irq, desc);
- }
desc->status |= IRQ_DISABLED;
desc->depth = 1;
}
@@ -592,7 +662,9 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
desc->chip->startup(irq);
}
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
+EXPORT_SYMBOL_GPL(__set_irq_handler);
void
set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bf..d06df9c41cb 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
}
/**
- * devm_request_irq - allocate an interrupt line for a managed device
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* If an IRQ allocated with this function needs to be freed
* separately, dev_free_irq() must be used.
*/
-int devm_request_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
- rc = request_irq(irq, handler, irqflags, devname, dev_id);
+ rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+ dev_id);
if (rc) {
devres_free(dr);
return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
/**
* devm_free_irq - free an interrupt
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be917..a81cf80554d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,12 +11,15 @@
*/
#include <linux/irq.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
+#include <linux/bootmem.h>
+#include <trace/events/irq.h>
#include "internals.h"
@@ -39,6 +42,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
ack_bad_irq(irq);
}
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template', that is used
@@ -57,6 +72,7 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
#ifdef CONFIG_SPARSE_IRQ
+
static struct irq_desc irq_desc_init = {
.irq = -1,
.status = IRQ_DISABLED,
@@ -64,44 +80,50 @@ static struct irq_desc irq_desc_init = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
};
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
{
- unsigned long bytes;
- char *ptr;
- int node;
-
- /* Compute how many bytes we need per irq and allocate them */
- bytes = nr * sizeof(unsigned int);
+ void *ptr;
- node = cpu_to_node(cpu);
- ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
- printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+ if (slab_is_available())
+ ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+ GFP_ATOMIC, node);
+ else
+ ptr = alloc_bootmem_node(NODE_DATA(node),
+ nr * sizeof(*desc->kstat_irqs));
- if (ptr)
- desc->kstat_irqs = (unsigned int *)ptr;
+ /*
+ * don't overwite if can not get new one
+ * init_copy_kstat_irqs() could still use old one
+ */
+ if (ptr) {
+ printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
+ desc->kstat_irqs = ptr;
+ }
}
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
{
memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
spin_lock_init(&desc->lock);
desc->irq = irq;
#ifdef CONFIG_SMP
- desc->cpu = cpu;
+ desc->node = node;
#endif
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_kstat_irqs(desc, cpu, nr_cpu_ids);
+ init_kstat_irqs(desc, node, nr_cpu_ids);
if (!desc->kstat_irqs) {
printk(KERN_ERR "can not alloc kstat_irqs\n");
BUG_ON(1);
}
- arch_init_chip_data(desc, cpu);
+ if (!alloc_desc_masks(desc, node, false)) {
+ printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+ BUG_ON(1);
+ }
+ init_desc_masks(desc);
+ arch_init_chip_data(desc, node);
}
/*
@@ -109,7 +131,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
*/
DEFINE_SPINLOCK(sparse_irq_lock);
-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+struct irq_desc **irq_desc_ptrs __read_mostly;
static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS_LEGACY-1] = {
@@ -119,33 +141,48 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
}
};
-/* FIXME: use bootmem alloc ...*/
-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+static unsigned int *kstat_irqs_legacy;
int __init early_irq_init(void)
{
struct irq_desc *desc;
int legacy_count;
+ int node;
int i;
+ init_irq_default_affinity();
+
+ /* initialize nr_irqs based on nr_cpu_ids */
+ arch_probe_nr_irqs();
+ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ node = first_online_node;
+
+ /* allocate irq_desc_ptrs array based on nr_irqs */
+ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
+
+ /* allocate based on nr_cpu_ids */
+ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int), GFP_NOWAIT, node);
for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
- desc[i].kstat_irqs = kstat_irqs_legacy[i];
+#ifdef CONFIG_SMP
+ desc[i].node = node;
+#endif
+ desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+ alloc_desc_masks(&desc[i], node, true);
+ init_desc_masks(&desc[i]);
irq_desc_ptrs[i] = desc + i;
}
- for (i = legacy_count; i < NR_IRQS; i++)
+ for (i = legacy_count; i < nr_irqs; i++)
irq_desc_ptrs[i] = NULL;
return arch_early_irq_init();
@@ -153,19 +190,20 @@ int __init early_irq_init(void)
struct irq_desc *irq_to_desc(unsigned int irq)
{
- return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+ if (irq_desc_ptrs && irq < nr_irqs)
+ return irq_desc_ptrs[irq];
+
+ return NULL;
}
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
{
struct irq_desc *desc;
unsigned long flags;
- int node;
- if (irq >= NR_IRQS) {
- printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
- irq, NR_IRQS);
- WARN_ON(1);
+ if (irq >= nr_irqs) {
+ WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+ irq, nr_irqs);
return NULL;
}
@@ -180,15 +218,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
if (desc)
goto out_unlock;
- node = cpu_to_node(cpu);
- desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
- printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
- irq, cpu, node);
+ if (slab_is_available())
+ desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+ else
+ desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
+ printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
if (!desc) {
printk(KERN_ERR "can not alloc irq_desc\n");
BUG_ON(1);
}
- init_one_irq_desc(irq, desc, cpu);
+ init_one_irq_desc(irq, desc, node);
irq_desc_ptrs[irq] = desc;
@@ -207,24 +247,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
- .affinity = CPU_MASK_ALL
-#endif
}
};
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
struct irq_desc *desc;
int count;
int i;
+ init_irq_default_affinity();
+
+ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
- for (i = 0; i < count; i++)
+ for (i = 0; i < count; i++) {
desc[i].irq = i;
-
+ alloc_desc_masks(&desc[i], 0, true);
+ init_desc_masks(&desc[i]);
+ desc[i].kstat_irqs = kstat_irqs_all[i];
+ }
return arch_early_irq_init();
}
@@ -233,12 +278,17 @@ struct irq_desc *irq_to_desc(unsigned int irq)
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
return irq_to_desc(irq);
}
#endif /* !CONFIG_SPARSE_IRQ */
+void clear_kstat_irqs(struct irq_desc *desc)
+{
+ memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+}
+
/*
* What should we do if we get a hw irq event on an illegal vector?
* Each architecture has to answer this themself.
@@ -300,6 +350,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
return IRQ_NONE;
}
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+ if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+ return;
+
+ printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+ "but no thread function available.", irq, action->name);
+}
+
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -316,9 +375,50 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
local_irq_enable_in_hardirq();
do {
+ trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
- if (ret == IRQ_HANDLED)
+ trace_irq_handler_exit(irq, action, ret);
+
+ switch (ret) {
+ case IRQ_WAKE_THREAD:
+ /*
+ * Set result to handled so the spurious check
+ * does not trigger.
+ */
+ ret = IRQ_HANDLED;
+
+ /*
+ * Catch drivers which return WAKE_THREAD but
+ * did not set up a thread function
+ */
+ if (unlikely(!action->thread_fn)) {
+ warn_no_thread(irq, action);
+ break;
+ }
+
+ /*
+ * Wake up the handler thread for this
+ * action. In case the thread crashed and was
+ * killed we just pretend that we handled the
+ * interrupt. The hardirq handler above has
+ * disabled the device interrupt, so no irq
+ * storm is lurking.
+ */
+ if (likely(!test_bit(IRQTF_DIED,
+ &action->thread_flags))) {
+ set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ wake_up_process(action->thread);
+ }
+
+ /* Fall through to add to randomness */
+ case IRQ_HANDLED:
status |= action->flags;
+ break;
+
+ default:
+ break;
+ }
+
retval |= ret;
action = action->next;
} while (action);
@@ -331,6 +431,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
}
#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+
+#ifdef CONFIG_ENABLE_WARN_DEPRECATED
+# warning __do_IRQ is deprecated. Please convert to proper flow handlers
+#endif
+
/**
* __do_IRQ - original all in one highlevel IRQ handler
* @irq: the interrupt number
@@ -356,11 +461,8 @@ unsigned int __do_IRQ(unsigned int irq)
/*
* No locking required for CPU-local interrupts:
*/
- if (desc->chip->ack) {
+ if (desc->chip->ack)
desc->chip->ack(irq);
- /* get new one */
- desc = irq_remap_to_desc(irq, desc);
- }
if (likely(!(desc->status & IRQ_DISABLED))) {
action_ret = handle_IRQ_event(irq, desc->action);
if (!noirqdebug)
@@ -371,10 +473,8 @@ unsigned int __do_IRQ(unsigned int irq)
}
spin_lock(&desc->lock);
- if (desc->chip->ack) {
+ if (desc->chip->ack)
desc->chip->ack(irq);
- desc = irq_remap_to_desc(irq, desc);
- }
/*
* REPLAY is when Linux resends an IRQ that was dropped earlier
* WAITING is used by probe to mark irqs that are being tested
@@ -451,12 +551,10 @@ void early_init_irq_lock_class(void)
}
}
-#ifdef CONFIG_SPARSE_IRQ
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
return desc ? desc->kstat_irqs[cpu] : 0;
}
-#endif
EXPORT_SYMBOL(kstat_irqs_cpu);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc12..1b5d742c6a7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,11 +12,21 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+extern void clear_kstat_irqs(struct irq_desc *desc);
extern spinlock_t sparse_irq_lock;
+
+#ifdef CONFIG_SPARSE_IRQ
+/* irq_desc_ptrs allocated at boot time */
+extern struct irq_desc **irq_desc_ptrs;
+#else
+/* irq_desc_ptrs is a fixed size array */
extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+#endif
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -32,6 +42,21 @@ static inline void unregister_handler_proc(unsigned int irq,
extern int irq_select_affinity_usr(unsigned int irq);
+extern void irq_set_thread_affinity(struct irq_desc *desc);
+
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_lock))
+ desc->chip->bus_lock(irq);
+}
+
+static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+{
+ if (unlikely(desc->chip->bus_sync_unlock))
+ desc->chip->bus_sync_unlock(irq);
+}
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb34..bde4c667d24 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,24 +8,15 @@
*/
#include <linux/irq.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/sched.h>
#include "internals.h"
-#ifdef CONFIG_SMP
-cpumask_var_t irq_default_affinity;
-
-static int init_irq_default_affinity(void)
-{
- alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
- cpumask_setall(irq_default_affinity);
- return 0;
-}
-core_initcall(init_irq_default_affinity);
-
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -61,9 +52,18 @@ void synchronize_irq(unsigned int irq)
/* Oops, that failed? */
} while (status & IRQ_INPROGRESS);
+
+ /*
+ * We made sure that no hardirq handler is running. Now verify
+ * that no threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
}
EXPORT_SYMBOL(synchronize_irq);
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -81,6 +81,26 @@ int irq_can_set_affinity(unsigned int irq)
}
/**
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affitnity changed
+ *
+ * We just set IRQTF_AFFINITY and delegate the affinity setting
+ * to the interrupt thread itself. We can not call
+ * set_cpus_allowed_ptr() here as we hold desc->lock and this
+ * code can be called from hard interrupt context.
+ */
+void irq_set_thread_affinity(struct irq_desc *desc)
+{
+ struct irqaction *action = desc->action;
+
+ while (action) {
+ if (action->thread)
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ action = action->next;
+ }
+}
+
+/**
* irq_set_affinity - Set the irq affinity of a given irq
* @irq: Interrupt to set affinity
* @cpumask: cpumask
@@ -97,16 +117,21 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
spin_lock_irqsave(&desc->lock, flags);
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- cpumask_copy(&desc->affinity, cpumask);
- desc->chip->set_affinity(irq, cpumask);
- } else {
+ if (desc->status & IRQ_MOVE_PCNTXT) {
+ if (!desc->chip->set_affinity(irq, cpumask)) {
+ cpumask_copy(desc->affinity, cpumask);
+ irq_set_thread_affinity(desc);
+ }
+ }
+ else {
desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(&desc->pending_mask, cpumask);
+ cpumask_copy(desc->pending_mask, cpumask);
}
#else
- cpumask_copy(&desc->affinity, cpumask);
- desc->chip->set_affinity(irq, cpumask);
+ if (!desc->chip->set_affinity(irq, cpumask)) {
+ cpumask_copy(desc->affinity, cpumask);
+ irq_set_thread_affinity(desc);
+ }
#endif
desc->status |= IRQ_AFFINITY_SET;
spin_unlock_irqrestore(&desc->lock, flags);
@@ -117,7 +142,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
/*
* Generic version of the affinity autoselector.
*/
-int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
+static int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
if (!irq_can_set_affinity(irq))
return 0;
@@ -127,21 +152,21 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+ if (cpumask_any_and(desc->affinity, cpu_online_mask)
< nr_cpu_ids)
goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+ cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
set_affinity:
- desc->chip->set_affinity(irq, &desc->affinity);
+ desc->chip->set_affinity(irq, desc->affinity);
return 0;
}
#else
-static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
{
return irq_select_affinity(irq);
}
@@ -157,19 +182,35 @@ int irq_select_affinity_usr(unsigned int irq)
int ret;
spin_lock_irqsave(&desc->lock, flags);
- ret = do_irq_select_affinity(irq, desc);
+ ret = setup_affinity(irq, desc);
+ if (!ret)
+ irq_set_thread_affinity(desc);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#else
-static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
{
return 0;
}
#endif
+void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+{
+ if (suspend) {
+ if (!desc->action || (desc->action->flags & IRQF_TIMER))
+ return;
+ desc->status |= IRQ_SUSPENDED;
+ }
+
+ if (!desc->depth++) {
+ desc->status |= IRQ_DISABLED;
+ desc->chip->disable(irq);
+ }
+}
+
/**
* disable_irq_nosync - disable an irq without waiting
* @irq: Interrupt to disable
@@ -189,12 +230,11 @@ void disable_irq_nosync(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
- if (!desc->depth++) {
- desc->status |= IRQ_DISABLED;
- desc->chip->disable(irq);
- }
+ __disable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -223,15 +263,21 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
+ if (resume)
+ desc->status &= ~IRQ_SUSPENDED;
+
switch (desc->depth) {
case 0:
+ err_out:
WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
unsigned int status = desc->status & ~IRQ_DISABLED;
+ if (desc->status & IRQ_SUSPENDED)
+ goto err_out;
/* Prevent probing on this irq: */
desc->status = status | IRQ_NOPROBE;
check_irq_resend(desc, irq);
@@ -250,7 +296,8 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
* matches the last disable, processing of interrupts on this
* IRQ line is re-enabled.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context only when
+ * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
@@ -260,9 +307,11 @@ void enable_irq(unsigned int irq)
if (!desc)
return;
+ chip_bus_lock(irq, desc);
spin_lock_irqsave(&desc->lock, flags);
- __enable_irq(desc, irq);
+ __enable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(enable_irq);
@@ -393,16 +442,175 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
}
/*
+ * Default primary interrupt handler for threaded interrupts. Is
+ * assigned as primary handler when request_threaded_irq is called
+ * with handler == NULL. Useful for oneshot interrupts.
+ */
+static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary handler for nested threaded interrupts. Should never be
+ * called.
+ */
+static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Primary handler called for nested irq %d\n", irq);
+ return IRQ_NONE;
+}
+
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (test_and_clear_bit(IRQTF_RUNTHREAD,
+ &action->thread_flags)) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+ schedule();
+ }
+ return -1;
+}
+
+/*
+ * Oneshot interrupts keep the irq line masked until the threaded
+ * handler finished. unmask if the interrupt has not been disabled and
+ * is marked MASKED.
+ */
+static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+{
+ chip_bus_lock(irq, desc);
+ spin_lock_irq(&desc->lock);
+ if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
+ desc->status &= ~IRQ_MASKED;
+ desc->chip->unmask(irq);
+ }
+ spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(irq, desc);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ spin_lock_irq(&desc->lock);
+ cpumask_copy(mask, desc->affinity);
+ spin_unlock_irq(&desc->lock);
+
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+ struct irqaction *action = data;
+ struct irq_desc *desc = irq_to_desc(action->irq);
+ int wake, oneshot = desc->status & IRQ_ONESHOT;
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+ current->irqaction = action;
+
+ while (!irq_wait_for_interrupt(action)) {
+
+ irq_thread_check_affinity(desc, action);
+
+ atomic_inc(&desc->threads_active);
+
+ spin_lock_irq(&desc->lock);
+ if (unlikely(desc->status & IRQ_DISABLED)) {
+ /*
+ * CHECKME: We might need a dedicated
+ * IRQ_THREAD_PENDING flag here, which
+ * retriggers the thread in check_irq_resend()
+ * but AFAICT IRQ_PENDING should be fine as it
+ * retriggers the interrupt itself --- tglx
+ */
+ desc->status |= IRQ_PENDING;
+ spin_unlock_irq(&desc->lock);
+ } else {
+ spin_unlock_irq(&desc->lock);
+
+ action->thread_fn(action->irq, action->dev_id);
+
+ if (oneshot)
+ irq_finalize_oneshot(action->irq, desc);
+ }
+
+ wake = atomic_dec_and_test(&desc->threads_active);
+
+ if (wake && waitqueue_active(&desc->wait_for_threads))
+ wake_up(&desc->wait_for_threads);
+ }
+
+ /*
+ * Clear irqaction. Otherwise exit_irq_thread() would make
+ * fuzz about an active irq thread going into nirvana.
+ */
+ current->irqaction = NULL;
+ return 0;
+}
+
+/*
+ * Called from do_exit()
+ */
+void exit_irq_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ if (!tsk->irqaction)
+ return;
+
+ printk(KERN_ERR
+ "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+
+ /*
+ * Set the THREAD DIED flag to prevent further wakeups of the
+ * soon to be gone threaded handler.
+ */
+ set_bit(IRQTF_DIED, &tsk->irqaction->flags);
+}
+
+/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
*/
static int
-__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
+__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
- struct irqaction *old, **p;
+ struct irqaction *old, **old_ptr;
const char *old_name = NULL;
unsigned long flags;
- int shared = 0;
+ int nested, shared = 0;
int ret;
if (!desc)
@@ -427,12 +635,53 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
rand_initialize_irq(irq);
}
+ /* Oneshot interrupts are not allowed with shared */
+ if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
+ return -EINVAL;
+
+ /*
+ * Check whether the interrupt nests into another interrupt
+ * thread.
+ */
+ nested = desc->status & IRQ_NESTED_THREAD;
+ if (nested) {
+ if (!new->thread_fn)
+ return -EINVAL;
+ /*
+ * Replace the primary handler which was provided from
+ * the driver for non nested interrupt handling by the
+ * dummy function which warns when called.
+ */
+ new->handler = irq_nested_primary_handler;
+ }
+
+ /*
+ * Create a handler thread when a thread function is supplied
+ * and the interrupt does not nest into another interrupt
+ * thread.
+ */
+ if (new->thread_fn && !nested) {
+ struct task_struct *t;
+
+ t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+ new->name);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ /*
+ * We keep the reference to the task struct even if
+ * the thread dies to avoid that the interrupt code
+ * references an already freed task_struct.
+ */
+ get_task_struct(t);
+ new->thread = t;
+ }
+
/*
* The following block of code has to be executed atomically
*/
spin_lock_irqsave(&desc->lock, flags);
- p = &desc->action;
- old = *p;
+ old_ptr = &desc->action;
+ old = *old_ptr;
if (old) {
/*
* Can't share interrupts unless both agree to and are
@@ -455,8 +704,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
/* add new interrupt at end of irq queue */
do {
- p = &old->next;
- old = *p;
+ old_ptr = &old->next;
+ old = *old_ptr;
} while (old);
shared = 1;
}
@@ -464,15 +713,15 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
+ init_waitqueue_head(&desc->wait_for_threads);
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc, irq,
new->flags & IRQF_TRIGGER_MASK);
- if (ret) {
- spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
- }
+ if (ret)
+ goto out_thread;
} else
compat_irq_chip_set_default_handler(desc);
#if defined(CONFIG_IRQ_PER_CPU)
@@ -480,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
desc->status |= IRQ_PER_CPU;
#endif
- desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
+ desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+ if (new->flags & IRQF_ONESHOT)
+ desc->status |= IRQ_ONESHOT;
+
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
desc->status &= ~IRQ_DISABLED;
@@ -496,7 +748,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
desc->status |= IRQ_NO_BALANCING;
/* Set default affinity mask once everything is setup */
- do_irq_select_affinity(irq, desc);
+ setup_affinity(irq, desc);
} else if ((new->flags & IRQF_TRIGGER_MASK)
&& (new->flags & IRQF_TRIGGER_MASK)
@@ -507,7 +759,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
(int)(new->flags & IRQF_TRIGGER_MASK));
}
- *p = new;
+ new->irq = irq;
+ *old_ptr = new;
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
@@ -519,12 +772,18 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
*/
if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
desc->status &= ~IRQ_SPURIOUS_DISABLED;
- __enable_irq(desc, irq);
+ __enable_irq(desc, irq, false);
}
spin_unlock_irqrestore(&desc->lock, flags);
- new->irq = irq;
+ /*
+ * Strictly no need to wake it up, but hung_task complains
+ * when no hard interrupt wakes the thread up.
+ */
+ if (new->thread)
+ wake_up_process(new->thread);
+
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
@@ -540,8 +799,19 @@ mismatch:
dump_stack();
}
#endif
+ ret = -EBUSY;
+
+out_thread:
spin_unlock_irqrestore(&desc->lock, flags);
- return -EBUSY;
+ if (new->thread) {
+ struct task_struct *t = new->thread;
+
+ new->thread = NULL;
+ if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+ kthread_stop(t);
+ put_task_struct(t);
+ }
+ return ret;
}
/**
@@ -557,97 +827,144 @@ int setup_irq(unsigned int irq, struct irqaction *act)
return __setup_irq(irq, desc, act);
}
+EXPORT_SYMBOL_GPL(setup_irq);
-/**
- * free_irq - free an interrupt
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
- *
- * This function must not be called from interrupt context.
+ /*
+ * Internal function to unregister an irqaction - used to free
+ * regular and special interrupts that are part of the architecture.
*/
-void free_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction **p;
+ struct irqaction *action, **action_ptr;
unsigned long flags;
- WARN_ON(in_interrupt());
+ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
- return;
+ return NULL;
spin_lock_irqsave(&desc->lock, flags);
- p = &desc->action;
+
+ /*
+ * There can be multiple actions per IRQ descriptor, find the right
+ * one based on the dev_id:
+ */
+ action_ptr = &desc->action;
for (;;) {
- struct irqaction *action = *p;
+ action = *action_ptr;
+
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ spin_unlock_irqrestore(&desc->lock, flags);
- if (action) {
- struct irqaction **pp = p;
+ return NULL;
+ }
- p = &action->next;
- if (action->dev_id != dev_id)
- continue;
+ if (action->dev_id == dev_id)
+ break;
+ action_ptr = &action->next;
+ }
- /* Found it - now remove it from the list of entries */
- *pp = action->next;
+ /* Found it - now remove it from the list of entries: */
+ *action_ptr = action->next;
- /* Currently used only by UML, might disappear one day.*/
+ /* Currently used only by UML, might disappear one day: */
#ifdef CONFIG_IRQ_RELEASE_METHOD
- if (desc->chip->release)
- desc->chip->release(irq, dev_id);
+ if (desc->chip->release)
+ desc->chip->release(irq, dev_id);
#endif
- if (!desc->action) {
- desc->status |= IRQ_DISABLED;
- if (desc->chip->shutdown)
- desc->chip->shutdown(irq);
- else
- desc->chip->disable(irq);
- }
- spin_unlock_irqrestore(&desc->lock, flags);
- unregister_handler_proc(irq, action);
+ /* If this was the last handler, shut down the IRQ line: */
+ if (!desc->action) {
+ desc->status |= IRQ_DISABLED;
+ if (desc->chip->shutdown)
+ desc->chip->shutdown(irq);
+ else
+ desc->chip->disable(irq);
+ }
+
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU: */
+ synchronize_irq(irq);
- /* Make sure it's not being used on another CPU */
- synchronize_irq(irq);
-#ifdef CONFIG_DEBUG_SHIRQ
- /*
- * It's a shared IRQ -- the driver ought to be
- * prepared for it to happen even now it's
- * being freed, so let's make sure.... We do
- * this after actually deregistering it, to
- * make sure that a 'real' IRQ doesn't run in
- * parallel with our fake
- */
- if (action->flags & IRQF_SHARED) {
- local_irq_save(flags);
- action->handler(irq, dev_id);
- local_irq_restore(flags);
- }
-#endif
- kfree(action);
- return;
- }
- printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
#ifdef CONFIG_DEBUG_SHIRQ
- dump_stack();
+ /*
+ * It's a shared IRQ -- the driver ought to be prepared for an IRQ
+ * event to happen even now it's being freed, so let's make sure that
+ * is so by doing an extra call to the handler ....
+ *
+ * ( We do this after actually deregistering it, to make sure that a
+ * 'real' IRQ doesn't run in * parallel with our fake. )
+ */
+ if (action->flags & IRQF_SHARED) {
+ local_irq_save(flags);
+ action->handler(irq, dev_id);
+ local_irq_restore(flags);
+ }
#endif
- spin_unlock_irqrestore(&desc->lock, flags);
- return;
+
+ if (action->thread) {
+ if (!test_bit(IRQTF_DIED, &action->thread_flags))
+ kthread_stop(action->thread);
+ put_task_struct(action->thread);
}
+
+ return action;
+}
+
+/**
+ * remove_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_irq(unsigned int irq, struct irqaction *act)
+{
+ __free_irq(irq, act->dev_id);
+}
+EXPORT_SYMBOL_GPL(remove_irq);
+
+/**
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return;
+
+ chip_bus_lock(irq, desc);
+ kfree(__free_irq(irq, dev_id));
+ chip_bus_sync_unlock(irq, desc);
}
EXPORT_SYMBOL(free_irq);
/**
- * request_irq - allocate an interrupt line
+ * request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts
+ * If NULL and thread_fn != NULL the default
+ * primary handler is installed
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device
* @dev_id: A cookie passed back to the handler function
@@ -659,6 +976,15 @@ EXPORT_SYMBOL(free_irq);
* raises, you must take care both to initialise your hardware
* and to set up the interrupt handler in the right order.
*
+ * If you want to set up a threaded irq handler for your device
+ * then you need to supply @handler and @thread_fn. @handler ist
+ * still called in hard interrupt context and has to check
+ * whether the interrupt originates from the device. If yes it
+ * needs to disable the interrupt on the device and return
+ * IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support
+ * shared interrupts.
+ *
* Dev_id must be globally unique. Normally the address of the
* device data structure is used as the cookie. Since the handler
* receives this value it makes sense to use it.
@@ -674,8 +1000,9 @@ EXPORT_SYMBOL(free_irq);
* IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
-int request_irq(unsigned int irq, irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+ irq_handler_t thread_fn, unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -687,11 +1014,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
* the behavior is classified as "will not fix" so we need to
* start nudging drivers away from using that idiom.
*/
- if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
- == (IRQF_SHARED|IRQF_DISABLED))
- pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
- "guaranteed on shared IRQs\n",
- irq, devname);
+ if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
+ (IRQF_SHARED|IRQF_DISABLED)) {
+ pr_warning(
+ "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
+ irq, devname);
+ }
#ifdef CONFIG_LOCKDEP
/*
@@ -714,21 +1042,27 @@ int request_irq(unsigned int irq, irq_handler_t handler,
if (desc->status & IRQ_NOREQUEST)
return -EINVAL;
- if (!handler)
- return -EINVAL;
- action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ if (!handler) {
+ if (!thread_fn)
+ return -EINVAL;
+ handler = irq_default_primary_handler;
+ }
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;
action->handler = handler;
+ action->thread_fn = thread_fn;
action->flags = irqflags;
- cpus_clear(action->mask);
action->name = devname;
- action->next = NULL;
action->dev_id = dev_id;
+ chip_bus_lock(irq, desc);
retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(irq, desc);
+
if (retval)
kfree(action);
@@ -753,4 +1087,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
#endif
return retval;
}
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630..fcb6c96f262 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
#include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
void move_masked_irq(int irq)
{
@@ -18,7 +21,7 @@ void move_masked_irq(int irq)
desc->status &= ~IRQ_MOVE_PENDING;
- if (unlikely(cpumask_empty(&desc->pending_mask)))
+ if (unlikely(cpumask_empty(desc->pending_mask)))
return;
if (!desc->chip->set_affinity)
@@ -38,13 +41,14 @@ void move_masked_irq(int irq)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids)) {
- cpumask_and(&desc->affinity,
- &desc->pending_mask, cpu_online_mask);
- desc->chip->set_affinity(irq, &desc->affinity);
- }
- cpumask_clear(&desc->pending_mask);
+ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
+ < nr_cpu_ids))
+ if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+ cpumask_copy(desc->affinity, desc->pending_mask);
+ irq_set_thread_affinity(desc);
+ }
+
+ cpumask_clear(desc->pending_mask);
}
void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77..3fd30197da2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,18 +15,13 @@
static void init_copy_kstat_irqs(struct irq_desc *old_desc,
struct irq_desc *desc,
- int cpu, int nr)
+ int node, int nr)
{
- unsigned long bytes;
+ init_kstat_irqs(desc, node, nr);
- init_kstat_irqs(desc, cpu, nr);
-
- if (desc->kstat_irqs != old_desc->kstat_irqs) {
- /* Compute how many bytes we need per irq and allocate them */
- bytes = nr * sizeof(unsigned int);
-
- memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
- }
+ if (desc->kstat_irqs != old_desc->kstat_irqs)
+ memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
+ nr * sizeof(*desc->kstat_irqs));
}
static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -38,30 +33,37 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->kstat_irqs = NULL;
}
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
- struct irq_desc *desc, int cpu)
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+ struct irq_desc *desc, int node)
{
memcpy(desc, old_desc, sizeof(struct irq_desc));
+ if (!alloc_desc_masks(desc, node, false)) {
+ printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+ "for migration.\n", irq);
+ return false;
+ }
spin_lock_init(&desc->lock);
- desc->cpu = cpu;
+ desc->node = node;
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
- init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
- arch_init_copy_chip_data(old_desc, desc, cpu);
+ init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
+ init_copy_desc_masks(old_desc, desc);
+ arch_init_copy_chip_data(old_desc, desc, node);
+ return true;
}
static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
{
free_kstat_irqs(old_desc, desc);
+ free_desc_masks(old_desc, desc);
arch_free_chip_data(old_desc, desc);
}
static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
- int cpu)
+ int node)
{
struct irq_desc *desc;
unsigned int irq;
unsigned long flags;
- int node;
irq = old_desc->irq;
@@ -71,48 +73,46 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
desc = irq_desc_ptrs[irq];
if (desc && old_desc != desc)
- goto out_unlock;
+ goto out_unlock;
- node = cpu_to_node(cpu);
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
if (!desc) {
- printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+ printk(KERN_ERR "irq %d: can not get new irq_desc "
+ "for migration.\n", irq);
+ /* still use old one */
+ desc = old_desc;
+ goto out_unlock;
+ }
+ if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
/* still use old one */
+ kfree(desc);
desc = old_desc;
goto out_unlock;
}
- init_copy_one_irq_desc(irq, old_desc, desc, cpu);
irq_desc_ptrs[irq] = desc;
+ spin_unlock_irqrestore(&sparse_irq_lock, flags);
/* free the old one */
free_one_irq_desc(old_desc, desc);
kfree(old_desc);
+ return desc;
+
out_unlock:
spin_unlock_irqrestore(&sparse_irq_lock, flags);
return desc;
}
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
{
- int old_cpu;
- int node, old_node;
-
- /* those all static, do move them */
- if (desc->irq < NR_IRQS_LEGACY)
+ /* those static or target node is -1, do not move them */
+ if (desc->irq < NR_IRQS_LEGACY || node == -1)
return desc;
- old_cpu = desc->cpu;
- if (old_cpu != cpu) {
- node = cpu_to_node(cpu);
- old_node = cpu_to_node(old_cpu);
- if (old_node != node)
- desc = __real_move_irq_desc(desc, cpu);
- else
- desc->cpu = cpu;
- }
+ if (desc->node != node)
+ desc = __real_move_irq_desc(desc, node);
return desc;
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 00000000000..a0bb09e7986
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
+/*
+ * linux/kernel/irq/pm.c
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file contains power management functions related to interrupts.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+/**
+ * suspend_device_irqs - disable all currently enabled interrupt lines
+ *
+ * During system-wide suspend or hibernation device drivers need to be prevented
+ * from receiving interrupts and this function is provided for this purpose.
+ * It marks all interrupt lines in use, except for the timer ones, as disabled
+ * and sets the IRQ_SUSPENDED flag for each of them.
+ */
+void suspend_device_irqs(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ for_each_irq_desc(irq, desc) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ __disable_irq(desc, irq, true);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ for_each_irq_desc(irq, desc)
+ if (desc->status & IRQ_SUSPENDED)
+ synchronize_irq(irq);
+}
+EXPORT_SYMBOL_GPL(suspend_device_irqs);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all interrupt lines previously disabled by suspend_device_irqs() that
+ * have the IRQ_SUSPENDED flag set.
+ */
+void resume_device_irqs(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ for_each_irq_desc(irq, desc) {
+ unsigned long flags;
+
+ if (!(desc->status & IRQ_SUSPENDED))
+ continue;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ __enable_irq(desc, irq, true);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+EXPORT_SYMBOL_GPL(resume_device_irqs);
+
+/**
+ * check_wakeup_irqs - check if any wake-up interrupts are pending
+ */
+int check_wakeup_irqs(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ for_each_irq_desc(irq, desc)
+ if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
+ return -EBUSY;
+
+ return 0;
+}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bce..692363dd591 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- const struct cpumask *mask = &desc->affinity;
+ const struct cpumask *mask = desc->affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PENDING)
- mask = &desc->pending_mask;
+ mask = desc->pending_mask;
#endif
seq_cpumask(m, mask);
seq_putc(m, '\n');
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2..090c3763f3a 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
- if (!desc->chip || !desc->chip->retrigger ||
- !desc->chip->retrigger(irq)) {
+ if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Set it pending and activate the softirq: */
set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56..114e704760f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_all_shared_irqs(void)
{
struct irq_desc *desc;
int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
try_one_irq(i, desc);
}
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+ poll_all_shared_irqs();
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
+#ifdef CONFIG_DEBUG_SHIRQ
+void debug_poll_all_shared_irqs(void)
+{
+ poll_all_shared_irqs();
+}
+#endif
+
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -285,7 +297,6 @@ static int __init irqfixup_setup(char *str)
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
-MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8b..58762f7077e 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
struct task_cputime cputime;
cputime_t utime;
- thread_group_cputime(tsk, &cputime);
+ thread_group_cputimer(tsk, &cputime);
utime = cputime.utime;
if (cputime_le(cval, utime)) { /* about to fire */
cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
struct task_cputime times;
cputime_t ptime;
- thread_group_cputime(tsk, &times);
+ thread_group_cputimer(tsk, &times);
ptime = cputime_add(times.utime, times.stime);
if (cputime_le(cval, ptime)) { /* about to fire */
cval = jiffies_to_cputime(1);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8..3a29dbe7898 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,24 @@
#define all_var 0
#endif
-extern const unsigned long kallsyms_addresses[];
-extern const u8 kallsyms_names[];
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
-/* tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV)
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
*/
extern const unsigned long kallsyms_num_syms
- __attribute__((__section__(".rodata")));
+__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[];
-extern const u16 kallsyms_token_index[];
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
-extern const unsigned long kallsyms_markers[];
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -74,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
return is_kernel_text(addr) || is_kernel_inittext(addr);
}
-/* expand a compressed symbol data into the resulting uncompressed string,
- given the offset to where the symbol is in the compressed stream */
+/*
+ * Expand a compressed symbol data into the resulting uncompressed string,
+ * given the offset to where the symbol is in the compressed stream.
+ */
static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
- /* get the compressed symbol length from the first symbol byte */
+ /* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
len = *data;
data++;
- /* update the offset to return the offset for the next symbol on
- * the compressed stream */
+ /*
+ * Update the offset to return the offset for the next symbol on
+ * the compressed stream.
+ */
off += len + 1;
- /* for every byte on the compressed symbol data, copy the table
- entry for that byte */
- while(len) {
- tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ];
+ /*
+ * For every byte on the compressed symbol data, copy the table
+ * entry for that byte.
+ */
+ while (len) {
+ tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
data++;
len--;
while (*tptr) {
- if(skipped_first) {
+ if (skipped_first) {
*result = *tptr;
result++;
} else
@@ -109,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
*result = '\0';
- /* return to offset to the next symbol */
+ /* Return to offset to the next symbol. */
return off;
}
-/* get symbol type information. This is encoded as a single char at the
- * begining of the symbol name */
+/*
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
static char kallsyms_get_symbol_type(unsigned int off)
{
- /* get just the first code, look it up in the token table, and return the
- * first char from this token */
- return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ];
+ /*
+ * Get just the first code, look it up in the token table,
+ * and return the first char from this token.
+ */
+ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
-/* find the offset on the compressed stream given and index in the
- * kallsyms array */
+/*
+ * Find the offset on the compressed stream given and index in the
+ * kallsyms array.
+ */
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
int i;
- /* use the closest marker we have. We have markers every 256 positions,
- * so that should be close enough */
- name = &kallsyms_names[ kallsyms_markers[pos>>8] ];
+ /*
+ * Use the closest marker we have. We have markers every 256 positions,
+ * so that should be close enough.
+ */
+ name = &kallsyms_names[kallsyms_markers[pos >> 8]];
- /* sequentially scan all the symbols up to the point we're searching for.
- * Every symbol is stored in a [<len>][<len> bytes of data] format, so we
- * just need to add the len to the current pointer for every symbol we
- * wish to skip */
- for(i = 0; i < (pos&0xFF); i++)
+ /*
+ * Sequentially scan all the symbols up to the point we're searching
+ * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
+ * so we just need to add the len to the current pointer for every
+ * symbol we wish to skip.
+ */
+ for (i = 0; i < (pos & 0xFF); i++)
name = name + (*name) + 1;
return name - kallsyms_names;
@@ -160,6 +181,25 @@ unsigned long kallsyms_lookup_name(const char *name)
return module_kallsyms_lookup_name(name);
}
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+ unsigned long),
+ void *data)
+{
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long i;
+ unsigned int off;
+ int ret;
+
+ for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+ off = kallsyms_expand_symbol(off, namebuf);
+ ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ if (ret != 0)
+ return ret;
+ }
+ return module_kallsyms_on_each_symbol(fn, data);
+}
+EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
+
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
@@ -167,7 +207,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
- /* do a binary search on the sorted kallsyms_addresses array */
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
+ /* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
high = kallsyms_num_syms;
@@ -180,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
}
/*
- * search for the first aliased symbol. Aliased
- * symbols are symbols with the same address
+ * Search for the first aliased symbol. Aliased
+ * symbols are symbols with the same address.
*/
while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
--low;
symbol_start = kallsyms_addresses[low];
- /* Search for next non-aliased symbol */
+ /* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
if (kallsyms_addresses[i] > symbol_start) {
symbol_end = kallsyms_addresses[i];
@@ -196,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
}
}
- /* if we found no next symbol, we use the end of the section */
+ /* If we found no next symbol, we use the end of the section. */
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
@@ -229,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
/*
* Lookup an address
- * - modname is set to NULL if it's in the kernel
- * - we guarantee that the returned name is valid until we reschedule even if
- * it resides in a module
- * - we also guarantee that modname will be valid until rescheduled
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
*/
const char *kallsyms_lookup(unsigned long addr,
unsigned long *symbolsize,
@@ -253,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
return namebuf;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return module_address_lookup(addr, symbolsize, offset, modname,
namebuf);
}
@@ -271,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
kallsyms_expand_symbol(get_symbol_offset(pos), symname);
return 0;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return lookup_module_symbol_name(addr, symname);
}
@@ -290,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
modname[0] = '\0';
return 0;
}
- /* see if it's in a module */
+ /* See if it's in a module. */
return lookup_module_symbol_attrs(addr, size, offset, modname, name);
}
@@ -319,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
return len;
}
+EXPORT_SYMBOL_GPL(sprint_symbol);
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
@@ -329,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
printk(fmt, buffer);
}
+EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
-struct kallsym_iter
-{
+struct kallsym_iter {
loff_t pos;
unsigned long value;
- unsigned int nameoff; /* If iterating in core kernel symbols */
+ unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
@@ -381,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
iter->pos = pos;
return get_ksymbol_mod(iter);
}
-
+
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
reset_iter(iter, pos);
@@ -416,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
{
struct kallsym_iter *iter = m->private;
- /* Some debugging symbols have no name. Ignore them. */
+ /* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
if (iter->module_name[0]) {
char type;
- /* Label it "global" if it is exported,
- * "local" if not exported. */
+ /*
+ * Label it "global" if it is exported,
+ * "local" if not exported.
+ */
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
- (int)(2*sizeof(void*)),
+ (int)(2 * sizeof(void *)),
iter->value, type, iter->name, iter->module_name);
} else
seq_printf(m, "%0*lx %c %s\n",
- (int)(2*sizeof(void*)),
+ (int)(2 * sizeof(void *)),
iter->value, iter->type, iter->name);
return 0;
}
@@ -446,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
static int kallsyms_open(struct inode *inode, struct file *file)
{
- /* We keep iterator in m->private, since normal case is to
+ /*
+ * We keep iterator in m->private, since normal case is to
* s_start from where we left off, so we avoid doing
- * using get_symbol_offset for every symbol */
+ * using get_symbol_offset for every symbol.
+ */
struct kallsym_iter *iter;
int ret;
@@ -477,7 +525,4 @@ static int __init kallsyms_init(void)
proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
return 0;
}
-__initcall(kallsyms_init);
-
-EXPORT_SYMBOL(__print_symbol);
-EXPORT_SYMBOL_GPL(sprint_symbol);
+device_initcall(kallsyms_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864..f336e2107f9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
note_buf_t* crash_notes;
/* vmcoreinfo stuff */
-unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
return;
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
- elf_core_copy_regs(&prstatus.pr_reg, regs);
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
} while (*cur++ == ',');
if (*crash_size > 0) {
- while (*cur != ' ' && *cur != '@')
+ while (*cur && *cur != ' ' && *cur != '@')
cur++;
if (*cur == '@') {
cur++;
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vm_struct, addr);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
VMCOREINFO_NUMBER(PG_lru);
@@ -1447,22 +1448,25 @@ int kernel_kexec(void)
goto Restore_console;
}
suspend_console();
- error = device_suspend(PMSG_FREEZE);
+ error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Resume_console;
- error = disable_nonboot_cpus();
- if (error)
- goto Resume_devices;
- device_pm_lock();
- local_irq_disable();
- /* At this point, device_suspend() has been called,
- * but *not* device_power_down(). We *must*
- * device_power_down() now. Otherwise, drivers for
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_noirq(). We *must* call
+ * dpm_suspend_noirq() now. Otherwise, drivers for
* some devices (e.g. interrupt controllers) become
* desynchronized with the actual state of the
* hardware at resume time, and evil weirdness ensues.
*/
- error = device_power_down(PMSG_FREEZE);
+ error = dpm_suspend_noirq(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ /* Suspend system devices */
+ error = sysdev_suspend(PMSG_FREEZE);
if (error)
goto Enable_irqs;
} else
@@ -1477,13 +1481,14 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
- device_power_up(PMSG_RESTORE);
+ sysdev_resume();
Enable_irqs:
local_irq_enable();
- device_pm_unlock();
+ Enable_cpus:
enable_nonboot_cpus();
+ dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
- device_resume(PMSG_RESTORE);
+ dpm_resume_end(PMSG_RESTORE);
Resume_console:
resume_console();
thaw_processes();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f..26539e3228e 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
/*
* round up to the next power of 2, since our 'let the indices
- * wrap' tachnique works only in this case.
+ * wrap' technique works only in this case.
*/
- if (size & (size - 1)) {
+ if (!is_power_of_2(size)) {
BUG_ON(size > 0x80000000);
size = roundup_pow_of_two(size);
}
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index e4dcfb2272a..9147a3190c9 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty)
static struct sysrq_key_op sysrq_gdb_op = {
.handler = sysrq_handle_gdb,
- .help_msg = "Gdb",
- .action_msg = "GDB",
+ .help_msg = "debug(G)",
+ .action_msg = "DEBUG",
};
#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443..9fcb53a11f8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
-#include <linux/mnt_namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/fdtable.h>
@@ -38,6 +37,8 @@
#include <linux/suspend.h>
#include <asm/uaccess.h>
+#include <trace/events/module.h>
+
extern int max_threads;
static struct workqueue_struct *khelper_wq;
@@ -50,7 +51,8 @@ static struct workqueue_struct *khelper_wq;
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
/**
- * request_module - try to load a kernel module
+ * __request_module - try to load a kernel module
+ * @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
@@ -63,7 +65,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
-int request_module(const char *fmt, ...)
+int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
@@ -78,6 +80,10 @@ int request_module(const char *fmt, ...)
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ ret = security_kernel_module_request();
+ if (ret)
+ return ret;
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
@@ -108,11 +114,14 @@ int request_module(const char *fmt, ...)
return -ENOMEM;
}
- ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+ trace_module_request(module_name, wait, _RET_IP_);
+
+ ret = call_usermodehelper(modprobe_path, argv, envp,
+ wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
}
-EXPORT_SYMBOL(request_module);
+EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
struct subprocess_info {
@@ -167,7 +176,7 @@ static int ____call_usermodehelper(void *data)
}
/* We can run anywhere, unlike our parent keventd(). */
- set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+ set_cpus_allowed_ptr(current, cpu_all_mask);
/*
* Our parent is keventd, which runs with elevated scheduling priority.
@@ -368,8 +377,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->argv = argv;
sub_info->envp = envp;
sub_info->cred = prepare_usermodehelper_creds();
- if (!sub_info->cred)
+ if (!sub_info->cred) {
+ kfree(sub_info);
return NULL;
+ }
out:
return sub_info;
@@ -459,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
int retval = 0;
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+ validate_creds(sub_info->cred);
helper_lock();
if (sub_info->path[0] == '\0')
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7ba8cd9845c..ef177d653b2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kdebug.h>
+#include <linux/memory.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
@@ -67,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
-static bool kprobe_enabled;
+static bool kprobes_all_disarmed;
static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -102,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
struct kprobe_insn_page {
- struct hlist_node hlist;
+ struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
char slot_used[INSNS_PER_PAGE];
int nused;
@@ -116,7 +117,7 @@ enum kprobe_slot_state {
};
static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
-static struct hlist_head kprobe_insn_pages;
+static LIST_HEAD(kprobe_insn_pages);
static int kprobe_garbage_slots;
static int collect_garbage_slots(void);
@@ -151,10 +152,9 @@ loop_end:
static kprobe_opcode_t __kprobes *__get_insn_slot(void)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
retry:
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -188,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
kfree(kip);
return NULL;
}
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+ INIT_LIST_HEAD(&kip->list);
+ list_add(&kip->list, &kprobe_insn_pages);
memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
@@ -218,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
* so as not to have to set it up again the
* next time somebody inserts a probe.
*/
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
+ if (!list_is_singular(&kprobe_insn_pages)) {
+ list_del(&kip->list);
module_free(NULL, kip->insns);
kfree(kip);
}
@@ -234,18 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
static int __kprobes collect_garbage_slots(void)
{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos, *next;
- int safety;
+ struct kprobe_insn_page *kip, *next;
/* Ensure no-one is preepmted on the garbages */
- mutex_unlock(&kprobe_insn_mutex);
- safety = check_safety();
- mutex_lock(&kprobe_insn_mutex);
- if (safety != 0)
+ if (check_safety())
return -EAGAIN;
- hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
+ list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
int i;
if (kip->ngarbage == 0)
continue;
@@ -263,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
{
struct kprobe_insn_page *kip;
- struct hlist_node *pos;
mutex_lock(&kprobe_insn_mutex);
- hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
+ list_for_each_entry(kip, &kprobe_insn_pages, list) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
if (dirty) {
kip->slot_used[i] = SLOT_DIRTY;
kip->ngarbage++;
- } else {
+ } else
collect_one_slot(kip, i);
- }
break;
}
}
@@ -318,6 +307,22 @@ struct kprobe __kprobes *get_kprobe(void *addr)
return NULL;
}
+/* Arm a kprobe with text_mutex */
+static void __kprobes arm_kprobe(struct kprobe *kp)
+{
+ mutex_lock(&text_mutex);
+ arch_arm_kprobe(kp);
+ mutex_unlock(&text_mutex);
+}
+
+/* Disarm a kprobe with text_mutex */
+static void __kprobes disarm_kprobe(struct kprobe *kp)
+{
+ mutex_lock(&text_mutex);
+ arch_disarm_kprobe(kp);
+ mutex_unlock(&text_mutex);
+}
+
/*
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
@@ -327,7 +332,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
- if (kp->pre_handler && !kprobe_gone(kp)) {
+ if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
if (kp->pre_handler(kp, regs))
return 1;
@@ -343,7 +348,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
- if (kp->post_handler && !kprobe_gone(kp)) {
+ if (kp->post_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
kp->post_handler(kp, regs, flags);
reset_kprobe_instance();
@@ -517,20 +522,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
}
/*
-* Add the new probe to old_p->list. Fail if this is the
+* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
+ BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler) {
- if (old_p->break_handler)
+ if (ap->break_handler)
return -EEXIST;
- list_add_tail_rcu(&p->list, &old_p->list);
- old_p->break_handler = aggr_break_handler;
+ list_add_tail_rcu(&p->list, &ap->list);
+ ap->break_handler = aggr_break_handler;
} else
- list_add_rcu(&p->list, &old_p->list);
- if (p->post_handler && !old_p->post_handler)
- old_p->post_handler = aggr_post_handler;
+ list_add_rcu(&p->list, &ap->list);
+ if (p->post_handler && !ap->post_handler)
+ ap->post_handler = aggr_post_handler;
+
+ if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ if (!kprobes_all_disarmed)
+ /* Arm the breakpoint again. */
+ arm_kprobe(ap);
+ }
return 0;
}
@@ -543,6 +556,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
+ ap->flags = p->flags;
ap->pre_handler = aggr_pre_handler;
ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
@@ -565,44 +579,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
struct kprobe *p)
{
int ret = 0;
- struct kprobe *ap;
+ struct kprobe *ap = old_p;
+
+ if (old_p->pre_handler != aggr_pre_handler) {
+ /* If old_p is not an aggr_probe, create new aggr_kprobe. */
+ ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+ if (!ap)
+ return -ENOMEM;
+ add_aggr_kprobe(ap, old_p);
+ }
- if (kprobe_gone(old_p)) {
+ if (kprobe_gone(ap)) {
/*
* Attempting to insert new probe at the same location that
* had a probe in the module vaddr area which already
* freed. So, the instruction slot has already been
* released. We need a new slot for the new probe.
*/
- ret = arch_prepare_kprobe(old_p);
+ ret = arch_prepare_kprobe(ap);
if (ret)
+ /*
+ * Even if fail to allocate new slot, don't need to
+ * free aggr_probe. It will be used next time, or
+ * freed by unregister_kprobe.
+ */
return ret;
- }
- if (old_p->pre_handler == aggr_pre_handler) {
- copy_kprobe(old_p, p);
- ret = add_new_kprobe(old_p, p);
- ap = old_p;
- } else {
- ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
- if (!ap) {
- if (kprobe_gone(old_p))
- arch_remove_kprobe(old_p);
- return -ENOMEM;
- }
- add_aggr_kprobe(ap, old_p);
- copy_kprobe(ap, p);
- ret = add_new_kprobe(ap, p);
- }
- if (kprobe_gone(old_p)) {
+
/*
- * If the old_p has gone, its breakpoint has been disarmed.
- * We have to arm it again after preparing real kprobes.
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
*/
- ap->flags &= ~KPROBE_FLAG_GONE;
- if (kprobe_enabled)
- arch_arm_kprobe(ap);
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
}
- return ret;
+
+ copy_kprobe(ap, p);
+ return add_new_kprobe(ap, p);
+}
+
+/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
+static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (!kprobe_disabled(kp))
+ /*
+ * There is an active probe on the list.
+ * We can't disable aggr_kprobe.
+ */
+ return 0;
+ }
+ p->flags |= KPROBE_FLAG_DISABLED;
+ return 1;
}
static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -657,13 +686,15 @@ int __kprobes register_kprobe(struct kprobe *p)
p->addr = addr;
preempt_disable();
- if (!__kernel_text_address((unsigned long) p->addr) ||
+ if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr)) {
preempt_enable();
return -EINVAL;
}
- p->flags = 0;
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
+
/*
* Check if are we probing a module.
*/
@@ -699,17 +730,20 @@ int __kprobes register_kprobe(struct kprobe *p)
goto out;
}
+ mutex_lock(&text_mutex);
ret = arch_prepare_kprobe(p);
if (ret)
- goto out;
+ goto out_unlock_text;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (kprobe_enabled)
+ if (!kprobes_all_disarmed && !kprobe_disabled(p))
arch_arm_kprobe(p);
+out_unlock_text:
+ mutex_unlock(&text_mutex);
out:
mutex_unlock(&kprobe_mutex);
@@ -718,26 +752,39 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(register_kprobe);
-/*
- * Unregister a kprobe without a scheduler synchronization.
- */
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
struct kprobe *old_p, *list_p;
old_p = get_kprobe(p->addr);
if (unlikely(!old_p))
- return -EINVAL;
+ return NULL;
if (p != old_p) {
list_for_each_entry_rcu(list_p, &old_p->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
- goto valid_p;
- return -EINVAL;
+ goto valid;
+ return NULL;
}
-valid_p:
+valid:
+ return old_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+ struct kprobe *old_p, *list_p;
+
+ old_p = __get_valid_kprobe(p);
+ if (old_p == NULL)
+ return -EINVAL;
+
if (old_p == p ||
(old_p->pre_handler == aggr_pre_handler &&
list_is_singular(&old_p->list))) {
@@ -746,8 +793,8 @@ valid_p:
* enabled and not gone - otherwise, the breakpoint would
* already have been removed. We save on flushing icache.
*/
- if (kprobe_enabled && !kprobe_gone(old_p))
- arch_disarm_kprobe(p);
+ if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
+ disarm_kprobe(p);
hlist_del_rcu(&old_p->hlist);
} else {
if (p->break_handler && !kprobe_gone(p))
@@ -761,6 +808,11 @@ valid_p:
}
noclean:
list_del_rcu(&p->list);
+ if (!kprobe_disabled(old_p)) {
+ try_to_disable_aggr_kprobe(old_p);
+ if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+ disarm_kprobe(old_p);
+ }
}
return 0;
}
@@ -796,11 +848,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_kprobes);
void __kprobes unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
+EXPORT_SYMBOL_GPL(unregister_kprobe);
void __kprobes unregister_kprobes(struct kprobe **kps, int num)
{
@@ -819,6 +873,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
}
+EXPORT_SYMBOL_GPL(unregister_kprobes);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -858,16 +913,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_jprobes);
int __kprobes register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
+EXPORT_SYMBOL_GPL(register_jprobe);
void __kprobes unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
+EXPORT_SYMBOL_GPL(unregister_jprobe);
void __kprobes unregister_jprobes(struct jprobe **jps, int num)
{
@@ -887,6 +945,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
__unregister_kprobe_bottom(&jps[i]->kp);
}
}
+EXPORT_SYMBOL_GPL(unregister_jprobes);
#ifdef CONFIG_KRETPROBES
/*
@@ -912,10 +971,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
ri->rp = rp;
ri->task = current;
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- spin_unlock_irqrestore(&rp->lock, flags);
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
return 0;
- }
arch_prepare_kretprobe(ri, regs);
@@ -982,6 +1039,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
free_rp_inst(rp);
return ret;
}
+EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
@@ -999,11 +1057,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
return ret;
}
+EXPORT_SYMBOL_GPL(register_kretprobes);
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
@@ -1025,24 +1085,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
}
}
}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(register_kretprobe);
int __kprobes register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
{
}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct pt_regs *regs)
@@ -1056,6 +1122,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
static void __kprobes kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+
p->flags |= KPROBE_FLAG_GONE;
if (p->pre_handler == aggr_pre_handler) {
/*
@@ -1168,8 +1235,8 @@ static int __init init_kprobes(void)
}
}
- /* By default, kprobes are enabled */
- kprobe_enabled = true;
+ /* By default, kprobes are armed */
+ kprobes_all_disarmed = false;
err = arch_init_kprobes();
if (!err)
@@ -1197,12 +1264,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
else
kprobe_type = "k";
if (sym)
- seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
- sym, offset, (modname ? modname : " "),
- (kprobe_gone(p) ? "[GONE]" : ""));
+ seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
+ p->addr, kprobe_type, sym, offset,
+ (modname ? modname : " "),
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+ "[DISABLED]" : ""));
else
- seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
- (kprobe_gone(p) ? "[GONE]" : ""));
+ seq_printf(pi, "%p %s %p %s%s\n",
+ p->addr, kprobe_type, p->addr,
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+ "[DISABLED]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1267,7 +1340,72 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-static void __kprobes enable_all_kprobes(void)
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If the probe is already disabled (or gone), just return */
+ if (kprobe_disabled(kp))
+ goto out;
+
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ /* When kp != p, p is always enabled. */
+ try_to_disable_aggr_kprobe(p);
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p))
+ disarm_kprobe(p);
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (kprobe_gone(kp)) {
+ /* This kprobe has gone, we couldn't enable it. */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p))
+ arm_kprobe(p);
+
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags &= ~KPROBE_FLAG_DISABLED;
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
+static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1276,18 +1414,20 @@ static void __kprobes enable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
- /* If kprobes are already enabled, just return */
- if (kprobe_enabled)
+ /* If kprobes are armed, just return */
+ if (!kprobes_all_disarmed)
goto already_enabled;
+ mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
- if (!kprobe_gone(p))
+ if (!kprobe_disabled(p))
arch_arm_kprobe(p);
}
+ mutex_unlock(&text_mutex);
- kprobe_enabled = true;
+ kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
already_enabled:
@@ -1295,7 +1435,7 @@ already_enabled:
return;
}
-static void __kprobes disable_all_kprobes(void)
+static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -1304,20 +1444,22 @@ static void __kprobes disable_all_kprobes(void)
mutex_lock(&kprobe_mutex);
- /* If kprobes are already disabled, just return */
- if (!kprobe_enabled)
+ /* If kprobes are already disarmed, just return */
+ if (kprobes_all_disarmed)
goto already_disabled;
- kprobe_enabled = false;
+ kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
+ mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
- if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
+ if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
arch_disarm_kprobe(p);
}
}
+ mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
/* Allow all currently running kprobes to complete */
synchronize_sched();
@@ -1338,7 +1480,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
{
char buf[3];
- if (kprobe_enabled)
+ if (!kprobes_all_disarmed)
buf[0] = '1';
else
buf[0] = '0';
@@ -1361,12 +1503,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
case 'y':
case 'Y':
case '1':
- enable_all_kprobes();
+ arm_all_kprobes();
break;
case 'n':
case 'N':
case '0':
- disable_all_kprobes();
+ disarm_all_kprobes();
break;
}
@@ -1409,16 +1551,5 @@ late_initcall(debugfs_kprobe_init);
module_init(init_kprobes);
-EXPORT_SYMBOL_GPL(register_kprobe);
-EXPORT_SYMBOL_GPL(unregister_kprobe);
-EXPORT_SYMBOL_GPL(register_kprobes);
-EXPORT_SYMBOL_GPL(unregister_kprobes);
-EXPORT_SYMBOL_GPL(register_jprobe);
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-EXPORT_SYMBOL_GPL(register_jprobes);
-EXPORT_SYMBOL_GPL(unregister_jprobes);
+/* defined in arch/.../kernel/kprobes.c */
EXPORT_SYMBOL_GPL(jprobe_return);
-EXPORT_SYMBOL_GPL(register_kretprobe);
-EXPORT_SYMBOL_GPL(unregister_kretprobe);
-EXPORT_SYMBOL_GPL(register_kretprobes);
-EXPORT_SYMBOL_GPL(unregister_kretprobes);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393..5fe709982ca 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,27 +9,22 @@
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <trace/sched.h>
-
-#define KTHREAD_NICE_LEVEL (-5)
+#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
- struct completion started;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -38,17 +33,13 @@ struct kthread_create_info
struct list_head list;
};
-struct kthread_stop_info
-{
- struct task_struct *k;
- int err;
- struct completion done;
+struct kthread {
+ int should_stop;
+ struct completion exited;
};
-/* Thread stopping is done by setthing this var: lock serializes
- * multiple kthread_stop calls. */
-static DEFINE_MUTEX(kthread_stop_lock);
-static struct kthread_stop_info kthread_stop_info;
+#define to_kthread(tsk) \
+ container_of((tsk)->vfork_done, struct kthread, exited)
/**
* kthread_should_stop - should this kthread return now?
@@ -59,35 +50,35 @@ static struct kthread_stop_info kthread_stop_info;
*/
int kthread_should_stop(void)
{
- return (kthread_stop_info.k == current);
+ return to_kthread(current)->should_stop;
}
EXPORT_SYMBOL(kthread_should_stop);
static int kthread(void *_create)
{
+ /* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
- int (*threadfn)(void *data);
- void *data;
- int ret = -EINTR;
+ int (*threadfn)(void *data) = create->threadfn;
+ void *data = create->data;
+ struct kthread self;
+ int ret;
- /* Copy data: it's on kthread's stack */
- threadfn = create->threadfn;
- data = create->data;
+ self.should_stop = 0;
+ init_completion(&self.exited);
+ current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
- complete(&create->started);
+ create->result = current;
+ complete(&create->done);
schedule();
- if (!kthread_should_stop())
+ ret = -EINTR;
+ if (!self.should_stop)
ret = threadfn(data);
- /* It might have exited on its own, w/o kthread_stop. Check. */
- if (kthread_should_stop()) {
- kthread_stop_info.err = ret;
- complete(&kthread_stop_info.done);
- }
- return 0;
+ /* we can't just return, we must preserve "self" on stack */
+ do_exit(ret);
}
static void create_kthread(struct kthread_create_info *create)
@@ -98,21 +89,8 @@ static void create_kthread(struct kthread_create_info *create)
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
create->result = ERR_PTR(pid);
- } else {
- struct sched_param param = { .sched_priority = 0 };
- wait_for_completion(&create->started);
- read_lock(&tasklist_lock);
- create->result = find_task_by_pid_ns(pid, &init_pid_ns);
- read_unlock(&tasklist_lock);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler(create->result, SCHED_NORMAL, &param);
- set_user_nice(create->result, KTHREAD_NICE_LEVEL);
- set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
+ complete(&create->done);
}
- complete(&create->done);
}
/**
@@ -143,7 +121,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
create.threadfn = threadfn;
create.data = data;
- init_completion(&create.started);
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -154,11 +131,19 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
+ struct sched_param param = { .sched_priority = 0 };
va_list args;
+
va_start(args, namefmt);
vsnprintf(create.result->comm, sizeof(create.result->comm),
namefmt, args);
va_end(args);
+ /*
+ * root may have changed our (kthreadd's) priority or CPU mask.
+ * The kernel thread should not inherit these properties.
+ */
+ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(create.result, cpu_all_mask);
}
return create.result;
}
@@ -192,40 +177,34 @@ EXPORT_SYMBOL(kthread_bind);
* @k: thread created by kthread_create().
*
* Sets kthread_should_stop() for @k to return true, wakes it, and
- * waits for it to exit. Your threadfn() must not call do_exit()
- * itself if you use this function! This can also be called after
- * kthread_create() instead of calling wake_up_process(): the thread
- * will exit without calling threadfn().
+ * waits for it to exit. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will exit without
+ * calling threadfn().
+ *
+ * If threadfn() may call do_exit() itself, the caller must ensure
+ * task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
* was never called.
*/
int kthread_stop(struct task_struct *k)
{
+ struct kthread *kthread;
int ret;
- mutex_lock(&kthread_stop_lock);
-
- /* It could exit after stop_info.k set, but before wake_up_process. */
- get_task_struct(k);
-
trace_sched_kthread_stop(k);
+ get_task_struct(k);
- /* Must init completion *before* thread sees kthread_stop_info.k */
- init_completion(&kthread_stop_info.done);
- smp_wmb();
+ kthread = to_kthread(k);
+ barrier(); /* it might have exited */
+ if (k->vfork_done != NULL) {
+ kthread->should_stop = 1;
+ wake_up_process(k);
+ wait_for_completion(&kthread->exited);
+ }
+ ret = k->exit_code;
- /* Now set kthread_should_stop() to true, and wake it up. */
- kthread_stop_info.k = k;
- wake_up_process(k);
put_task_struct(k);
-
- /* Once it dies, reset stop ptr, gather result and we're done. */
- wait_for_completion(&kthread_stop_info.done);
- kthread_stop_info.k = NULL;
- ret = kthread_stop_info.err;
- mutex_unlock(&kthread_stop_lock);
-
trace_sched_kthread_stop_ret(ret);
return ret;
@@ -239,8 +218,8 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_user_nice(tsk, KTHREAD_NICE_LEVEL);
- set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+ set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_mems_allowed(node_possible_map);
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 449db466bdb..ca07c5c0c91 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -9,6 +9,44 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
+
+/*
+ * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
+ * used by the "latencytop" userspace tool. The latency that is tracked is not
+ * the 'traditional' interrupt latency (which is primarily caused by something
+ * else consuming CPU), but instead, it is the latency an application encounters
+ * because the kernel sleeps on its behalf for various reasons.
+ *
+ * This code tracks 2 levels of statistics:
+ * 1) System level latency
+ * 2) Per process latency
+ *
+ * The latency is stored in fixed sized data structures in an accumulated form;
+ * if the "same" latency cause is hit twice, this will be tracked as one entry
+ * in the data structure. Both the count, total accumulated latency and maximum
+ * latency are tracked in this data structure. When the fixed size structure is
+ * full, no new causes are tracked until the buffer is flushed by writing to
+ * the /proc file; the userspace tool does this on a regular basis.
+ *
+ * A latency cause is identified by a stringified backtrace at the point that
+ * the scheduler gets invoked. The userland tool will use this string to
+ * identify the cause of the latency in human readable form.
+ *
+ * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
+ * These files look like this:
+ *
+ * Latency Top version : v0.1
+ * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
+ * | | | |
+ * | | | +----> the stringified backtrace
+ * | | +---------> The maximum latency for this entry in microseconds
+ * | +--------------> The accumulated latency for this entry (microseconds)
+ * +-------------------> The number of times this entry is hit
+ *
+ * (note: the average latency is the accumulated latency divided by the number
+ * of times)
+ */
+
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
firstnonnull = i;
continue;
}
- for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat->backtrace[q];
if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
-static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+/*
+ * Iterator to store a backtrace into a latency record entry
+ */
+static inline void store_stacktrace(struct task_struct *tsk,
+ struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
- trace.skip = 0;
save_stack_trace_tsk(tsk, &trace);
}
+/**
+ * __account_scheduler_latency - record an occured latency
+ * @tsk - the task struct of the task hitting the latency
+ * @usecs - the duration of the latency in microseconds
+ * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ *
+ * This function is the main entry point for recording latency entries
+ * as called by the scheduler.
+ *
+ * This function has a few special cases to deal with normal 'non-latency'
+ * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
+ * since this usually is caused by waiting for events via select() and co.
+ *
+ * Negative latencies (caused by time going backwards) are also explicitly
+ * skipped.
+ */
void __sched
-account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
unsigned long flags;
int i, q;
struct latency_record lat;
- if (!latencytop_enabled)
- return;
-
/* Long interruptible waits are generally user requested... */
if (inter && usecs > 5000)
return;
+ /* Negative sleeps are time going backwards */
+ /* Zero-time sleeps are non-interesting */
+ if (usecs <= 0)
+ return;
+
memset(&lat, 0, sizeof(lat));
lat.count = 1;
lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
if (tsk->latency_record_count >= LT_SAVECOUNT)
goto out_unlock;
- for (i = 0; i < LT_SAVECOUNT ; i++) {
+ for (i = 0; i < LT_SAVECOUNT; i++) {
struct latency_record *mylat;
int same = 1;
mylat = &tsk->latency_record[i];
- for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat.backtrace[q];
if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
for (i = 0; i < MAXLR; i++) {
if (latency_record[i].backtrace[0]) {
int q;
- seq_printf(m, "%i %li %li ",
+ seq_printf(m, "%i %lu %lu ",
latency_record[i].count,
latency_record[i].time,
latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
return single_open(filp, lstats_show, NULL);
}
-static struct file_operations lstats_fops = {
+static const struct file_operations lstats_fops = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
-__initcall(init_lstats_procfs);
+device_initcall(init_lstats_procfs);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c3568f0..f74d2d7aa60 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,11 +41,16 @@
#include <linux/utsname.h>
#include <linux/hash.h>
#include <linux/ftrace.h>
+#include <linux/stringify.h>
+#include <linux/bitops.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/lockdep.h>
+
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -310,12 +315,14 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
+# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
+# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -360,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
save_stack_trace(trace);
+ /*
+ * Some daft arches put -1 at the end to indicate its a full trace.
+ *
+ * <rant> this is buggy anyway, since it takes a whole extra entry so a
+ * complete trace that maxes out the entries provided will be reported
+ * as incomplete, friggin useless </rant>
+ */
+ if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+ trace->nr_entries--;
+
trace->max_entries = trace->nr_entries;
nr_stack_trace_entries += trace->nr_entries;
- if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -382,20 +399,6 @@ unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
-unsigned int max_recursion_depth;
-
-static unsigned int lockdep_dependency_gen_id;
-
-static bool lockdep_dependency_visit(struct lock_class *source,
- unsigned int depth)
-{
- if (!depth)
- lockdep_dependency_gen_id++;
- if (source->dep_gen_id == lockdep_dependency_gen_id)
- return true;
- source->dep_gen_id = lockdep_dependency_gen_id;
- return false;
-}
#ifdef CONFIG_DEBUG_LOCKDEP
/*
@@ -425,35 +428,26 @@ atomic_t redundant_softirqs_on;
atomic_t redundant_softirqs_off;
atomic_t nr_unused_locks;
atomic_t nr_cyclic_checks;
-atomic_t nr_cyclic_check_recursions;
atomic_t nr_find_usage_forwards_checks;
-atomic_t nr_find_usage_forwards_recursions;
atomic_t nr_find_usage_backwards_checks;
-atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr) atomic_inc(ptr)
-# define debug_atomic_dec(ptr) atomic_dec(ptr)
-# define debug_atomic_read(ptr) atomic_read(ptr)
-#else
-# define debug_atomic_inc(ptr) do { } while (0)
-# define debug_atomic_dec(ptr) do { } while (0)
-# define debug_atomic_read(ptr) 0
#endif
/*
* Locking printouts:
*/
+#define __USAGE(__STATE) \
+ [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
+ [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
+ [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
+ [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
+
static const char *usage_str[] =
{
- [LOCK_USED] = "initial-use ",
- [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
- [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
- [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
- [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
- [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
- [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
- [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
- [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
+#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ [LOCK_USED] = "INITIAL USE",
};
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -461,46 +455,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
-void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+static inline unsigned long lock_flag(enum lock_usage_bit bit)
{
- *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
-
- if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
- *c1 = '+';
- else
- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
- *c1 = '-';
+ return 1UL << bit;
+}
- if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
- *c2 = '+';
- else
- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
- *c2 = '-';
+static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
+{
+ char c = '.';
- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
- *c3 = '-';
- if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
- *c3 = '+';
- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
- *c3 = '?';
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '+';
+ if (class->usage_mask & lock_flag(bit)) {
+ c = '-';
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '?';
}
- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
- *c4 = '-';
- if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
- *c4 = '+';
- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
- *c4 = '?';
- }
+ return c;
+}
+
+void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
+{
+ int i = 0;
+
+#define LOCKDEP_STATE(__STATE) \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+ usage[i] = '\0';
}
static void print_lock_name(struct lock_class *class)
{
- char str[KSYM_NAME_LEN], c1, c2, c3, c4;
+ char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
const char *name;
- get_usage_chars(class, &c1, &c2, &c3, &c4);
+ get_usage_chars(class, usage);
name = class->name;
if (!name) {
@@ -513,7 +506,7 @@ static void print_lock_name(struct lock_class *class)
if (class->subclass)
printk("/%d", class->subclass);
}
- printk("){%c%c%c%c}", c1, c2, c3, c4);
+ printk("){%s}", usage);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -552,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
}
}
-static void print_lock_class_header(struct lock_class *class, int depth)
-{
- int bit;
-
- printk("%*s->", depth, "");
- print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
-
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
- if (class->usage_mask & (1 << bit)) {
- int len = depth;
-
- len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
- print_stack_trace(class->usage_traces + bit, len);
- }
- }
- printk("%*s }\n", depth, "");
-
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
-}
-
-/*
- * printk all lock dependencies starting at <entry>:
- */
-static void __used
-print_lock_dependencies(struct lock_class *class, int depth)
-{
- struct lock_list *entry;
-
- if (lockdep_dependency_visit(class, depth))
- return;
-
- if (DEBUG_LOCKS_WARN_ON(depth >= 20))
- return;
-
- print_lock_class_header(class, depth);
-
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (DEBUG_LOCKS_WARN_ON(!entry->class))
- return;
-
- print_lock_dependencies(entry->class, depth + 1);
-
- printk("%*s ... acquired at:\n",depth,"");
- print_stack_trace(&entry->trace, 2);
- printk("\n");
- }
-}
-
static void print_kernel_version(void)
{
printk("%s %.*s\n", init_utsname()->release,
@@ -796,6 +737,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return NULL;
}
class = lock_classes + nr_lock_classes++;
@@ -859,6 +801,7 @@ static struct lock_list *alloc_list_entry(void)
printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return NULL;
}
return list_entries + nr_list_entries++;
@@ -897,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
}
/*
+ * For good efficiency of modular, we use power of 2
+ */
+#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/*
+ * The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ */
+struct circular_queue {
+ unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ unsigned int front, rear;
+};
+
+static struct circular_queue lock_cq;
+
+unsigned int max_bfs_queue_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+ cq->front = cq->rear = 0;
+ lockdep_dependency_gen_id++;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+ return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+ return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+ if (__cq_full(cq))
+ return -1;
+
+ cq->element[cq->rear] = elem;
+ cq->rear = (cq->rear + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+ if (__cq_empty(cq))
+ return -1;
+
+ *elem = cq->element[cq->front];
+ cq->front = (cq->front + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
+{
+ return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+ struct lock_list *parent)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ lock->parent = parent;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries);
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+ return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+ int depth = 0;
+ struct lock_list *parent;
+
+ while ((parent = get_lock_parent(child))) {
+ child = parent;
+ depth++;
+ }
+ return depth;
+}
+
+static int __bfs(struct lock_list *source_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int forward)
+{
+ struct lock_list *entry;
+ struct list_head *head;
+ struct circular_queue *cq = &lock_cq;
+ int ret = 1;
+
+ if (match(source_entry, data)) {
+ *target_entry = source_entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (forward)
+ head = &source_entry->class->locks_after;
+ else
+ head = &source_entry->class->locks_before;
+
+ if (list_empty(head))
+ goto exit;
+
+ __cq_init(cq);
+ __cq_enqueue(cq, (unsigned long)source_entry);
+
+ while (!__cq_empty(cq)) {
+ struct lock_list *lock;
+
+ __cq_dequeue(cq, (unsigned long *)&lock);
+
+ if (!lock->class) {
+ ret = -2;
+ goto exit;
+ }
+
+ if (forward)
+ head = &lock->class->locks_after;
+ else
+ head = &lock->class->locks_before;
+
+ list_for_each_entry(entry, head, entry) {
+ if (!lock_accessed(entry)) {
+ unsigned int cq_depth;
+ mark_lock_accessed(entry, lock);
+ if (match(entry, data)) {
+ *target_entry = entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (__cq_enqueue(cq, (unsigned long)entry)) {
+ ret = -1;
+ goto exit;
+ }
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
+ }
+ }
+ }
+exit:
+ return ret;
+}
+
+static inline int __bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 1);
+
+}
+
+static inline int __bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 0);
+
+}
+
+/*
* Recursive, forwards-direction lock-dependency checking, used for
* both noncyclic checking and for hardirq-unsafe/softirq-unsafe
* checking.
- *
- * (to keep the stackframe of the recursive functions small we
- * use these global variables, and we also mark various helper
- * functions as noinline.)
*/
-static struct held_lock *check_source, *check_target;
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
static noinline int
-print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
return 0;
@@ -929,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
* header first:
*/
static noinline int
-print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ if (debug_locks_silent)
return 0;
printk("\n=======================================================\n");
@@ -942,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
printk( "-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
- print_lock(check_source);
+ print_lock(check_src);
printk("\nbut task is already holding lock:\n");
- print_lock(check_target);
+ print_lock(check_tgt);
printk("\nwhich lock already depends on the new lock.\n\n");
printk("\nthe existing dependency chain (in reverse order) is:\n");
@@ -953,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
return 0;
}
-static noinline int print_circular_bug_tail(void)
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+ return entry->class == data;
+}
+
+static noinline int print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
- struct lock_list this;
+ struct lock_list *parent;
+ int depth;
- if (debug_locks_silent)
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- this.class = hlock_class(check_source);
- if (!save_trace(&this.trace))
+ if (!save_trace(&this->trace))
return 0;
- print_circular_bug_entry(&this, 0);
+ depth = get_lock_depth(target);
+
+ print_circular_bug_header(target, depth, check_src, check_tgt);
+
+ parent = get_lock_parent(target);
+
+ while (parent) {
+ print_circular_bug_entry(parent, --depth);
+ parent = get_lock_parent(parent);
+ }
printk("\nother info that might help us debug this:\n\n");
lockdep_print_held_locks(curr);
@@ -976,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
return 0;
}
-#define RECURSION_LIMIT 40
-
-static int noinline print_infinite_recursion_bug(void)
+static noinline int print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
return 0;
- WARN_ON(1);
+ WARN(1, "lockdep bfs error:%d\n", ret);
return 0;
}
-unsigned long __lockdep_count_forward_deps(struct lock_class *class,
- unsigned int depth)
+static int noop_count(struct lock_list *entry, void *data)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ (*(unsigned long *)data)++;
+ return 0;
+}
- if (lockdep_dependency_visit(class, depth))
- return 0;
+unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_after, entry)
- ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+ __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
-
unsigned long lockdep_count_forward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_forward_deps(class, 0);
+ ret = __lockdep_count_forward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
return ret;
}
-unsigned long __lockdep_count_backward_deps(struct lock_class *class,
- unsigned int depth)
+unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
- struct lock_list *entry;
- unsigned long ret = 1;
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
- if (lockdep_dependency_visit(class, depth))
- return 0;
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_before, entry)
- ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+ __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
- return ret;
+ return count;
}
unsigned long lockdep_count_backward_deps(struct lock_class *class)
{
unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
local_irq_save(flags);
__raw_spin_lock(&lockdep_lock);
- ret = __lockdep_count_backward_deps(class, 0);
+ ret = __lockdep_count_backward_deps(&this);
__raw_spin_unlock(&lockdep_lock);
local_irq_restore(flags);
@@ -1054,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
* lead to <target>. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_class *source, unsigned int depth)
+check_noncircular(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
+ int result;
- if (lockdep_dependency_visit(source, depth))
- return 1;
+ debug_atomic_inc(&nr_cyclic_checks);
- debug_atomic_inc(&nr_cyclic_check_recursions);
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == hlock_class(check_target))
- return print_circular_bug_header(entry, depth+1);
- debug_atomic_inc(&nr_cyclic_checks);
- if (!check_noncircular(entry->class, depth+1))
- return print_circular_bug_entry(entry, depth+1);
- }
- return 1;
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
}
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1085,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
*/
-static enum lock_usage_bit find_usage_bit;
-static struct lock_class *forwards_match, *backwards_match;
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+ return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
/*
* Find a node in the forwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <forwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <forwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_forwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
-
- if (lockdep_dependency_visit(source, depth))
- return 1;
-
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
+ int result;
debug_atomic_inc(&nr_find_usage_forwards_checks);
- if (source->usage_mask & (1 << find_usage_bit)) {
- forwards_match = source;
- return 2;
- }
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_after, entry) {
- debug_atomic_inc(&nr_find_usage_forwards_recursions);
- ret = find_usage_forwards(entry->class, depth+1);
- if (ret == 2 || ret == 0)
- return ret;
- }
- return 1;
+ result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
}
/*
* Find a node in the backwards-direction dependency sub-graph starting
- * at <source> that matches <find_usage_bit>.
+ * at @root->class that matches @bit.
*
- * Return 2 if such a node exists in the subgraph, and put that node
- * into <backwards_match>.
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
*
- * Return 1 otherwise and keep <backwards_match> unchanged.
- * Return 0 on error.
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
*/
-static noinline int
-find_usage_backwards(struct lock_class *source, unsigned int depth)
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
{
- struct lock_list *entry;
- int ret;
+ int result;
- if (lockdep_dependency_visit(source, depth))
- return 1;
+ debug_atomic_inc(&nr_find_usage_backwards_checks);
- if (!__raw_spin_is_locked(&lockdep_lock))
- return DEBUG_LOCKS_WARN_ON(1);
+ result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
- if (depth > max_recursion_depth)
- max_recursion_depth = depth;
- if (depth >= RECURSION_LIMIT)
- return print_infinite_recursion_bug();
+ return result;
+}
- debug_atomic_inc(&nr_find_usage_backwards_checks);
- if (source->usage_mask & (1 << find_usage_bit)) {
- backwards_match = source;
- return 2;
- }
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
- if (!source && debug_locks_off_graph_unlock()) {
- WARN_ON(1);
- return 0;
- }
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
- /*
- * Check this lock's dependency list:
- */
- list_for_each_entry(entry, &source->locks_before, entry) {
- debug_atomic_inc(&nr_find_usage_backwards_recursions);
- ret = find_usage_backwards(entry->class, depth+1);
- if (ret == 2 || ret == 0)
- return ret;
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
}
- return 1;
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ printk("%*s ... acquired at:\n", depth, "");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad BFS generated tree\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+
+ return;
}
static int
print_bad_irq_dependency(struct task_struct *curr,
+ struct lock_list *prev_root,
+ struct lock_list *next_root,
+ struct lock_list *backwards_entry,
+ struct lock_list *forwards_entry,
struct held_lock *prev,
struct held_lock *next,
enum lock_usage_bit bit1,
@@ -1214,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_match);
+ print_lock_name(backwards_entry->class);
printk("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_match->usage_traces + bit1, 1);
+ print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
printk("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_match);
+ print_lock_name(forwards_entry->class);
printk("\n... which became %s-irq-unsafe at:\n", irqclass);
printk("...");
- print_stack_trace(forwards_match->usage_traces + bit2, 1);
+ print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
printk("\nother info that might help us debug this:\n\n");
lockdep_print_held_locks(curr);
- printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
- print_lock_dependencies(backwards_match, 0);
+ printk("\nthe dependencies between %s-irq-safe lock", irqclass);
+ printk(" and the holding lock:\n");
+ if (!save_trace(&prev_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(backwards_entry, prev_root);
- printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
- print_lock_dependencies(forwards_match, 0);
+ printk("\nthe dependencies between the lock to be acquired");
+ printk(" and %s-irq-unsafe lock:\n", irqclass);
+ if (!save_trace(&next_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(forwards_entry, next_root);
printk("\nstack backtrace:\n");
dump_stack();
@@ -1247,25 +1397,76 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
enum lock_usage_bit bit_forwards, const char *irqclass)
{
int ret;
+ struct lock_list this, that;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *uninitialized_var(target_entry1);
+
+ this.parent = NULL;
- find_usage_bit = bit_backwards;
- /* fills in <backwards_match> */
- ret = find_usage_backwards(hlock_class(prev), 0);
- if (!ret || ret == 1)
+ this.class = hlock_class(prev);
+ ret = find_usage_backwards(&this, bit_backwards, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- find_usage_bit = bit_forwards;
- ret = find_usage_forwards(hlock_class(next), 0);
- if (!ret || ret == 1)
+ that.parent = NULL;
+ that.class = hlock_class(next);
+ ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- /* ret == 2 */
- return print_bad_irq_dependency(curr, prev, next,
+
+ return print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
bit_backwards, bit_forwards, irqclass);
}
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
+{
+ return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+}
+
+static int exclusive_bit(int new_bit)
+{
+ /*
+ * USED_IN
+ * USED_IN_READ
+ * ENABLED
+ * ENABLED_READ
+ *
+ * bit 0 - write/read
+ * bit 1 - used_in/enabled
+ * bit 2+ state
+ */
+
+ int state = new_bit & ~3;
+ int dir = new_bit & 2;
+
+ /*
+ * keep state, bit flip the direction and strip read.
+ */
+ return state | (dir ^ 2);
+}
+
+static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit)
{
/*
* Prove that the new dependency does not connect a hardirq-safe
@@ -1273,38 +1474,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
- if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
- LOCK_ENABLED_HARDIRQS, "hard"))
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
return 0;
+ bit++; /* _READ */
+
/*
* Prove that the new dependency does not connect a hardirq-safe-read
* lock with a hardirq-unsafe lock - to achieve this we search
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
- if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
- LOCK_ENABLED_HARDIRQS, "hard-read"))
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
return 0;
- /*
- * Prove that the new dependency does not connect a softirq-safe
- * lock with a softirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
- */
- if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
- LOCK_ENABLED_SOFTIRQS, "soft"))
- return 0;
- /*
- * Prove that the new dependency does not connect a softirq-safe-read
- * lock with a softirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
- */
- if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
- LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 1;
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+#define LOCKDEP_STATE(__STATE) \
+ if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
return 0;
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
return 1;
}
@@ -1435,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
{
struct lock_list *entry;
int ret;
+ struct lock_list this;
+ struct lock_list *uninitialized_var(target_entry);
/*
* Prove that the new <prev> -> <next> dependency would not
@@ -1445,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* We are using global variables to control the recursion, to
* keep the stackframe size of the recursive functions low:
*/
- check_source = next;
- check_target = prev;
- if (!(check_noncircular(hlock_class(next), 0)))
- return print_circular_bug_tail();
+ this.class = hlock_class(next);
+ this.parent = NULL;
+ ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+ if (unlikely(!ret))
+ return print_circular_bug(&this, target_entry, next, prev);
+ else if (unlikely(ret < 0))
+ return print_bfs_bug(ret);
if (!check_prev_add_irq(curr, prev, next))
return 0;
@@ -1649,6 +1851,7 @@ cache_hit:
printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
chain = lock_chains + nr_lock_chains++;
@@ -1846,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* print irq inversion bug:
*/
static int
-print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+print_irq_inversion_bug(struct task_struct *curr,
+ struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
const char *irqclass)
{
@@ -1861,20 +2065,19 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
- printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+ printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
- printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
- print_lock_name(other);
+ printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ print_lock_name(other->class);
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
printk("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nthe first lock's dependencies:\n");
- print_lock_dependencies(hlock_class(this), 0);
-
- printk("\nthe second lock's dependencies:\n");
- print_lock_dependencies(other, 0);
+ printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ if (!save_trace(&root->trace))
+ return 0;
+ print_shortest_lock_dependencies(other, root);
printk("\nstack backtrace:\n");
dump_stack();
@@ -1891,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit bit, const char *irqclass)
{
int ret;
-
- find_usage_bit = bit;
- /* fills in <forwards_match> */
- ret = find_usage_forwards(hlock_class(this), 0);
- if (!ret || ret == 1)
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_forwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
}
/*
@@ -1910,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit bit, const char *irqclass)
{
int ret;
-
- find_usage_bit = bit;
- /* fills in <backwards_match> */
- ret = find_usage_backwards(hlock_class(this), 0);
- if (!ret || ret == 1)
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_backwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
}
void print_irqtrace_events(struct task_struct *curr)
@@ -1933,7 +2146,7 @@ void print_irqtrace_events(struct task_struct *curr)
print_ip_sym(curr->softirq_disable_ip);
}
-static int hardirq_verbose(struct lock_class *class)
+static int HARDIRQ_verbose(struct lock_class *class)
{
#if HARDIRQ_VERBOSE
return class_filter(class);
@@ -1941,7 +2154,7 @@ static int hardirq_verbose(struct lock_class *class)
return 0;
}
-static int softirq_verbose(struct lock_class *class)
+static int SOFTIRQ_verbose(struct lock_class *class)
{
#if SOFTIRQ_VERBOSE
return class_filter(class);
@@ -1949,185 +2162,95 @@ static int softirq_verbose(struct lock_class *class)
return 0;
}
+static int RECLAIM_FS_verbose(struct lock_class *class)
+{
+#if RECLAIM_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
#define STRICT_READ_CHECKS 1
-static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+ __STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline int state_verbose(enum lock_usage_bit bit,
+ struct lock_class *class)
+{
+ return state_verbose_f[bit >> 2](class);
+}
+
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+ enum lock_usage_bit bit, const char *name);
+
+static int
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- int ret = 1;
+ int excl_bit = exclusive_bit(new_bit);
+ int read = new_bit & 1;
+ int dir = new_bit & 2;
- switch(new_bit) {
- case LOCK_USED_IN_HARDIRQ:
- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
- return 0;
- if (!valid_state(curr, this, new_bit,
- LOCK_ENABLED_HARDIRQS_READ))
- return 0;
- /*
- * just marked it hardirq-safe, check that this lock
- * took no hardirq-unsafe lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_HARDIRQS, "hard"))
- return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it hardirq-safe, check that this lock
- * took no hardirq-unsafe-read lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
- return 0;
-#endif
- if (hardirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_USED_IN_SOFTIRQ:
- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
- return 0;
- if (!valid_state(curr, this, new_bit,
- LOCK_ENABLED_SOFTIRQS_READ))
- return 0;
- /*
- * just marked it softirq-safe, check that this lock
- * took no softirq-unsafe lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_SOFTIRQS, "soft"))
- return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it softirq-safe, check that this lock
- * took no softirq-unsafe-read lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
- return 0;
-#endif
- if (softirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_USED_IN_HARDIRQ_READ:
- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
- return 0;
- /*
- * just marked it hardirq-read-safe, check that this lock
- * took no hardirq-unsafe lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_HARDIRQS, "hard"))
- return 0;
- if (hardirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_USED_IN_SOFTIRQ_READ:
- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
- return 0;
- /*
- * just marked it softirq-read-safe, check that this lock
- * took no softirq-unsafe lock in the past:
- */
- if (!check_usage_forwards(curr, this,
- LOCK_ENABLED_SOFTIRQS, "soft"))
- return 0;
- if (softirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_ENABLED_HARDIRQS:
- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
- return 0;
- if (!valid_state(curr, this, new_bit,
- LOCK_USED_IN_HARDIRQ_READ))
- return 0;
- /*
- * just marked it hardirq-unsafe, check that no hardirq-safe
- * lock in the system ever took it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_HARDIRQ, "hard"))
- return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it hardirq-unsafe, check that no
- * hardirq-safe-read lock in the system ever took
- * it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
- return 0;
-#endif
- if (hardirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_ENABLED_SOFTIRQS:
- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
- return 0;
- if (!valid_state(curr, this, new_bit,
- LOCK_USED_IN_SOFTIRQ_READ))
- return 0;
- /*
- * just marked it softirq-unsafe, check that no softirq-safe
- * lock in the system ever took it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_SOFTIRQ, "soft"))
- return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it softirq-unsafe, check that no
- * softirq-safe-read lock in the system ever took
- * it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
- return 0;
-#endif
- if (softirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_ENABLED_HARDIRQS_READ:
- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
- return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it hardirq-read-unsafe, check that no
- * hardirq-safe lock in the system ever took it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_HARDIRQ, "hard"))
- return 0;
-#endif
- if (hardirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- case LOCK_ENABLED_SOFTIRQS_READ:
- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ *
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ check_usage_f usage = dir ?
+ check_usage_backwards : check_usage_forwards;
+
+ /*
+ * Validate that this particular lock does not have conflicting
+ * usage states.
+ */
+ if (!valid_state(curr, this, new_bit, excl_bit))
+ return 0;
+
+ /*
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
+ */
+ if ((!read || !dir || STRICT_READ_CHECKS) &&
+ !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ return 0;
+
+ /*
+ * Check for read in write conflicts
+ */
+ if (!read) {
+ if (!valid_state(curr, this, new_bit, excl_bit + 1))
return 0;
-#if STRICT_READ_CHECKS
- /*
- * just marked it softirq-read-unsafe, check that no
- * softirq-safe lock in the system ever took it in the past:
- */
- if (!check_usage_backwards(curr, this,
- LOCK_USED_IN_SOFTIRQ, "soft"))
+
+ if (STRICT_READ_CHECKS &&
+ !usage(curr, this, excl_bit + 1,
+ state_name(new_bit + 1)))
return 0;
-#endif
- if (softirq_verbose(hlock_class(this)))
- ret = 2;
- break;
- default:
- WARN_ON(1);
- break;
}
- return ret;
+ if (state_verbose(new_bit, hlock_class(this)))
+ return 2;
+
+ return 1;
}
+enum mark_type {
+#define LOCKDEP_STATE(__STATE) __STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, int hardirq)
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
{
enum lock_usage_bit usage_bit;
struct held_lock *hlock;
@@ -2136,17 +2259,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
- if (hardirq) {
- if (hlock->read)
- usage_bit = LOCK_ENABLED_HARDIRQS_READ;
- else
- usage_bit = LOCK_ENABLED_HARDIRQS;
- } else {
- if (hlock->read)
- usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
- else
- usage_bit = LOCK_ENABLED_SOFTIRQS;
- }
+ usage_bit = 2 + (mark << 2); /* ENABLED */
+ if (hlock->read)
+ usage_bit += 1; /* READ */
+
+ BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+
if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
@@ -2200,7 +2318,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, 1))
+ if (!mark_held_locks(curr, HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2208,7 +2326,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, 0))
+ if (!mark_held_locks(curr, SOFTIRQ))
return;
curr->hardirq_enable_ip = ip;
@@ -2288,7 +2406,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, 0);
+ mark_held_locks(curr, SOFTIRQ);
}
/*
@@ -2317,6 +2435,48 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(&redundant_softirqs_off);
}
+static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_WAIT))
+ return;
+
+ /* this guy won't enter reclaim */
+ if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
+ return;
+
+ mark_held_locks(curr, RECLAIM_FS);
+}
+
+static void check_flags(unsigned long flags);
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lockdep_trace_alloc(gfp_mask, flags);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
{
/*
@@ -2345,19 +2505,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
if (!hlock->hardirqs_off) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS_READ))
+ LOCK_ENABLED_HARDIRQ_READ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS_READ))
+ LOCK_ENABLED_SOFTIRQ_READ))
return 0;
} else {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS))
+ LOCK_ENABLED_HARDIRQ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS))
+ LOCK_ENABLED_SOFTIRQ))
+ return 0;
+ }
+ }
+
+ /*
+ * We reuse the irq context infrastructure more broadly as a general
+ * context checking code. This tests GFP_FS recursion (a lock taken
+ * during reclaim for a GFP_FS allocation is held over a GFP_FS
+ * allocation).
+ */
+ if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+ if (hlock->read) {
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
return 0;
}
}
@@ -2412,6 +2588,10 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
#endif
/*
@@ -2445,14 +2625,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
switch (new_bit) {
- case LOCK_USED_IN_HARDIRQ:
- case LOCK_USED_IN_SOFTIRQ:
- case LOCK_USED_IN_HARDIRQ_READ:
- case LOCK_USED_IN_SOFTIRQ_READ:
- case LOCK_ENABLED_HARDIRQS:
- case LOCK_ENABLED_SOFTIRQS:
- case LOCK_ENABLED_HARDIRQS_READ:
- case LOCK_ENABLED_SOFTIRQS_READ:
+#define LOCKDEP_STATE(__STATE) \
+ case LOCK_USED_IN_##__STATE: \
+ case LOCK_USED_IN_##__STATE##_READ: \
+ case LOCK_ENABLED_##__STATE: \
+ case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
ret = mark_lock_irq(curr, this, new_bit);
if (!ret)
return 0;
@@ -2488,13 +2667,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- if (unlikely(!debug_locks))
+ lock->class_cache = NULL;
+#ifdef CONFIG_LOCK_STAT
+ lock->cpu = raw_smp_processor_id();
+#endif
+
+ if (DEBUG_LOCKS_WARN_ON(!name)) {
+ lock->name = "NULL";
return;
+ }
+
+ lock->name = name;
if (DEBUG_LOCKS_WARN_ON(!key))
return;
- if (DEBUG_LOCKS_WARN_ON(!name))
- return;
/*
* Sanity check, the lock-class key must be persistent:
*/
@@ -2503,12 +2689,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
DEBUG_LOCKS_WARN_ON(1);
return;
}
- lock->name = name;
lock->key = key;
- lock->class_cache = NULL;
-#ifdef CONFIG_LOCK_STAT
- lock->cpu = raw_smp_processor_id();
-#endif
+
+ if (unlikely(!debug_locks))
+ return;
+
if (subclass)
register_lock_class(lock, subclass, 1);
}
@@ -2520,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
- struct lockdep_map *nest_lock, unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip,
+ int references)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
struct held_lock *hlock;
unsigned int depth, id;
int chain_head = 0;
+ int class_idx;
u64 chain_key;
if (!prove_locking)
@@ -2542,6 +2729,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
@@ -2573,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
+ class_idx = class - lock_classes + 1;
+
+ if (depth) {
+ hlock = curr->held_locks + depth - 1;
+ if (hlock->class_idx == class_idx && nest_lock) {
+ if (hlock->references)
+ hlock->references++;
+ else
+ hlock->references = 2;
+
+ return 1;
+ }
+ }
+
hlock = curr->held_locks + depth;
if (DEBUG_LOCKS_WARN_ON(!class))
return 0;
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class_idx;
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
@@ -2584,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->read = read;
hlock->check = check;
hlock->hardirqs_off = !!hardirqs_off;
+ hlock->references = references;
#ifdef CONFIG_LOCK_STAT
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = sched_clock();
@@ -2638,6 +2841,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
debug_locks_off();
printk("BUG: MAX_LOCK_DEPTH too low!\n");
printk("turning off the locking correctness validator.\n");
+ dump_stack();
return 0;
}
@@ -2691,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 1;
}
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+ if (hlock->instance == lock)
+ return 1;
+
+ if (hlock->references) {
+ struct lock_class *class = lock->class_cache;
+
+ if (!class)
+ class = look_up_lock_class(lock, 0);
+
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+
+ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+ return 0;
+
+ if (hlock->class_idx == class - lock_classes + 1)
+ return 1;
+ }
+
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
@@ -2714,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -2733,7 +2961,8 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
return 0;
}
@@ -2772,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
return print_unlock_inbalance_bug(curr, lock, ip);
found_it:
- lock_release_holdtime(hlock);
+ if (hlock->instance == lock)
+ lock_release_holdtime(hlock);
+
+ if (hlock->references) {
+ hlock->references--;
+ if (hlock->references) {
+ /*
+ * We had, and after removing one, still have
+ * references, the current lock stack is still
+ * valid. We're done!
+ */
+ return 1;
+ }
+ }
/*
* We have the right lock to unlock, 'hlock' points to it.
* Now we remove it from the stack, and add back the other
* entries (if any), recalculating the hash along the way:
*/
+
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -2794,7 +3037,8 @@ found_it:
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
return 0;
}
@@ -2824,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
/*
* Is the unlock non-nested:
*/
- if (hlock->instance != lock)
+ if (hlock->instance != lock || hlock->references)
return lock_release_non_nested(curr, lock, ip);
curr->lockdep_depth--;
@@ -2869,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
check_chain_key(curr);
}
+static int __lock_is_held(struct lockdep_map *lock)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* Check whether we follow the irq-flags state precisely:
*/
@@ -2935,6 +3194,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
if (unlikely(current->lockdep_recursion))
return;
@@ -2943,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
current->lockdep_recursion = 1;
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip);
+ irqs_disabled_flags(flags), nest_lock, ip, 0);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -2954,6 +3215,8 @@ void lock_release(struct lockdep_map *lock, int nested,
{
unsigned long flags;
+ trace_lock_release(lock, nested, ip);
+
if (unlikely(current->lockdep_recursion))
return;
@@ -2966,6 +3229,36 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
+int lock_is_held(struct lockdep_map *lock)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (unlikely(current->lockdep_recursion))
+ return ret;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ ret = __lock_is_held(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+ current->lockdep_reclaim_gfp = gfp_mask;
+}
+
+void lockdep_clear_current_reclaim_state(void)
+{
+ current->lockdep_reclaim_gfp = 0;
+}
+
#ifdef CONFIG_LOCK_STAT
static int
print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -3015,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -3023,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
return;
found_it:
+ if (hlock->instance != lock)
+ return;
+
hlock->waittime_stamp = sched_clock();
contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3062,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
*/
if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
break;
- if (hlock->instance == lock)
+ if (match_held_lock(hlock, lock))
goto found_it;
prev_hlock = hlock;
}
@@ -3070,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
return;
found_it:
+ if (hlock->instance != lock)
+ return;
+
cpu = smp_processor_id();
if (hlock->waittime_stamp) {
now = sched_clock();
@@ -3077,6 +3376,8 @@ found_it:
hlock->holdtime_stamp = now;
}
+ trace_lock_acquired(lock, ip, waittime);
+
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -3096,6 +3397,8 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
+ trace_lock_contended(lock, ip);
+
if (unlikely(!lock_stat))
return;
@@ -3296,7 +3599,12 @@ void __init lockdep_info(void)
sizeof(struct list_head) * CLASSHASH_SIZE +
sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+ sizeof(struct list_head) * CHAINHASH_SIZE
+#ifdef CONFIG_PROVE_LOCKING
+ + sizeof(struct circular_queue)
+#endif
+ ) / 1024
+ );
printk(" per task-struct memory footprint: %lu bytes\n",
sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b196932c0..a2ee95ad131 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -7,6 +7,45 @@
*/
/*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit {
+#define LOCKDEP_STATE(__STATE) \
+ LOCK_USED_IN_##__STATE, \
+ LOCK_USED_IN_##__STATE##_READ, \
+ LOCK_ENABLED_##__STATE, \
+ LOCK_ENABLED_##__STATE##_READ,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ LOCK_USED,
+ LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
+
+enum {
+#define LOCKDEP_STATE(__STATE) \
+ __LOCKF(USED_IN_##__STATE) \
+ __LOCKF(USED_IN_##__STATE##_READ) \
+ __LOCKF(ENABLED_##__STATE) \
+ __LOCKF(ENABLED_##__STATE##_READ)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ __LOCKF(USED)
+};
+
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_ENABLED_IRQ_READ \
+ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
+/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
*
@@ -15,9 +54,9 @@
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
-#define MAX_LOCKDEP_ENTRIES 8192UL
+#define MAX_LOCKDEP_ENTRIES 16384UL
-#define MAX_LOCKDEP_CHAINS_BITS 14
+#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -31,8 +70,10 @@
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-extern void
-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+
+extern void get_usage_chars(struct lock_class *class,
+ char usage[LOCK_USAGE_CHARS]);
extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
@@ -50,6 +91,8 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+extern unsigned int max_bfs_queue_depth;
+
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b81389..d4b3dbc79fd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct lock_class *class;
-
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- class = m->private;
- else {
- class = v;
-
- if (class->lock_entry.next != &all_lock_classes)
- class = list_entry(class->lock_entry.next,
- struct lock_class, lock_entry);
- else
- class = NULL;
- }
-
- return class;
+ return seq_list_next(v, &all_lock_classes, pos);
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- struct lock_class *class;
- loff_t i = 0;
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
- if (++i == *pos)
- return class;
- }
- return NULL;
+ return seq_list_start_head(&all_lock_classes, *pos);
}
static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = v;
+ struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
struct lock_list *entry;
- char c1, c2, c3, c4;
+ char usage[LOCK_USAGE_CHARS];
- if (v == SEQ_START_TOKEN) {
+ if (v == &all_lock_classes) {
seq_printf(m, "all lock classes:\n");
return 0;
}
@@ -100,8 +74,8 @@ static int l_show(struct seq_file *m, void *v)
seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
#endif
- get_usage_chars(class, &c1, &c2, &c3, &c4);
- seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
seq_printf(m, ": ");
print_name(m, class);
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
static int lockdep_open(struct inode *inode, struct file *file)
{
- int res = seq_open(file, &lockdep_ops);
- if (!res) {
- struct seq_file *m = file->private_data;
-
- if (!list_empty(&all_lock_classes))
- m->private = list_entry(all_lock_classes.next,
- struct lock_class, lock_entry);
- else
- m->private = NULL;
- }
- return res;
+ return seq_open(file, &lockdep_ops);
}
static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
};
#ifdef CONFIG_PROVE_LOCKING
-static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct lock_chain *chain;
-
- (*pos)++;
-
- if (v == SEQ_START_TOKEN)
- chain = m->private;
- else {
- chain = v;
-
- if (*pos < nr_lock_chains)
- chain = lock_chains + *pos;
- else
- chain = NULL;
- }
-
- return chain;
-}
-
static void *lc_start(struct seq_file *m, loff_t *pos)
{
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos < nr_lock_chains)
- return lock_chains + *pos;
+ if (*pos - 1 < nr_lock_chains)
+ return lock_chains + (*pos - 1);
return NULL;
}
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return lc_start(m, pos);
+}
+
static void lc_stop(struct seq_file *m, void *v)
{
}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
static int lockdep_chains_open(struct inode *inode, struct file *file)
{
- int res = seq_open(file, &lockdep_chains_ops);
- if (!res) {
- struct seq_file *m = file->private_data;
-
- if (nr_lock_chains)
- m->private = lock_chains;
- else
- m->private = NULL;
- }
- return res;
+ return seq_open(file, &lockdep_chains_ops);
}
static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(&chain_lookup_hits));
seq_printf(m, " cyclic checks: %11u\n",
debug_atomic_read(&nr_cyclic_checks));
- seq_printf(m, " cyclic-check recursions: %11u\n",
- debug_atomic_read(&nr_cyclic_check_recursions));
seq_printf(m, " find-mask forwards checks: %11u\n",
debug_atomic_read(&nr_find_usage_forwards_checks));
- seq_printf(m, " find-mask forwards recursions: %11u\n",
- debug_atomic_read(&nr_find_usage_forwards_recursions));
seq_printf(m, " find-mask backwards checks: %11u\n",
debug_atomic_read(&nr_find_usage_backwards_checks));
- seq_printf(m, " find-mask backwards recursions:%11u\n",
- debug_atomic_read(&nr_find_usage_backwards_recursions));
seq_printf(m, " hardirq on events: %11u\n", hi1);
seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -300,27 +235,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_uncategorized++;
if (class->usage_mask & LOCKF_USED_IN_IRQ)
nr_irq_safe++;
- if (class->usage_mask & LOCKF_ENABLED_IRQS)
+ if (class->usage_mask & LOCKF_ENABLED_IRQ)
nr_irq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
nr_softirq_safe++;
- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
nr_softirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
nr_hardirq_safe++;
- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
nr_hardirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
nr_irq_read_safe++;
- if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+ if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
nr_irq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
nr_softirq_read_safe++;
- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
nr_softirq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
nr_hardirq_read_safe++;
- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
#ifdef CONFIG_PROVE_LOCKING
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_unused);
seq_printf(m, " max locking depth: %11u\n",
max_lockdep_depth);
- seq_printf(m, " max recursion depth: %11u\n",
- max_recursion_depth);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " max bfs queue depth: %11u\n",
+ max_bfs_queue_depth);
+#endif
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
};
struct lock_stat_seq {
- struct lock_stat_data *iter;
struct lock_stat_data *iter_end;
struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
};
@@ -601,6 +537,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
static void seq_header(struct seq_file *m)
{
seq_printf(m, "lock_stat version 0.3\n");
+
+ if (unlikely(!debug_locks))
+ seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
"%14s %14s\n",
@@ -622,34 +562,22 @@ static void seq_header(struct seq_file *m)
static void *ls_start(struct seq_file *m, loff_t *pos)
{
struct lock_stat_seq *data = m->private;
+ struct lock_stat_data *iter;
if (*pos == 0)
return SEQ_START_TOKEN;
- data->iter = data->stats + *pos;
- if (data->iter >= data->iter_end)
- data->iter = NULL;
+ iter = data->stats + (*pos - 1);
+ if (iter >= data->iter_end)
+ iter = NULL;
- return data->iter;
+ return iter;
}
static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct lock_stat_seq *data = m->private;
-
(*pos)++;
-
- if (v == SEQ_START_TOKEN)
- data->iter = data->stats;
- else {
- data->iter = v;
- data->iter++;
- }
-
- if (data->iter == data->iter_end)
- data->iter = NULL;
-
- return data->iter;
+ return ls_start(m, pos);
}
static void ls_stop(struct seq_file *m, void *v)
@@ -687,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
- data->iter = iter;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
iter->class = class;
iter->stats = lock_stats(class);
@@ -695,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
}
data->iter_end = iter;
- sort(data->stats, data->iter_end - data->iter,
+ sort(data->stats, data->iter_end - data->stats,
sizeof(struct lock_stat_data),
lock_stat_cmp, NULL);
@@ -730,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
vfree(seq->private);
- seq->private = NULL;
return seq_release(inode, file);
}
@@ -754,7 +680,8 @@ static int __init lockdep_proc_init(void)
&proc_lockdep_stats_operations);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+ &proc_lock_stat_operations);
#endif
return 0;
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 00000000000..995b0cc2b84
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,9 @@
+/*
+ * Lockdep states,
+ *
+ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
+ * you add one, or come up with a nice dynamic solution.
+ */
+LOCKDEP_STATE(HARDIRQ)
+LOCKDEP_STATE(SOFTIRQ)
+LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd7..46580edff0c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
*/
#include <linux/module.h>
#include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/fs.h>
@@ -51,6 +52,13 @@
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
#if 0
#define DEBUGP printk
@@ -67,15 +75,19 @@
/* List of modules, protected by module_mutex or preempt_disable
* (delete uses stop_machine/add uses RCU list operations). */
-static DEFINE_MUTEX(module_mutex);
+DEFINE_MUTEX(module_mutex);
+EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-/* Bounds of module allocation, for speeding __module_text_address */
+/* Bounds of module allocation, for speeding __module_address */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
int register_module_notifier(struct notifier_block * nb)
@@ -185,17 +197,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
-struct symsearch {
- const struct kernel_symbol *start, *stop;
- const unsigned long *crcs;
- enum {
- NOT_GPL_ONLY,
- GPL_ONLY,
- WILL_BE_GPL_ONLY,
- } licence;
- bool unused;
-};
-
static bool each_symbol_in_section(const struct symsearch *arr,
unsigned int arrsize,
struct module *owner,
@@ -216,10 +217,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
-static bool each_symbol(bool (*fn)(const struct symsearch *arr,
- struct module *owner,
- unsigned int symnum, void *data),
- void *data)
+bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
+ unsigned int symnum, void *data), void *data)
{
struct module *mod;
const struct symsearch arr[] = {
@@ -272,6 +271,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
}
return false;
}
+EXPORT_SYMBOL_GPL(each_symbol);
struct find_symbol_arg {
/* Input */
@@ -282,7 +282,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
const unsigned long *crc;
- unsigned long value;
+ const struct kernel_symbol *sym;
};
static bool find_symbol_in_section(const struct symsearch *syms,
@@ -323,17 +323,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
- fsa->value = syms->start[symnum].value;
+ fsa->sym = &syms->start[symnum];
return true;
}
-/* Find a symbol, return value, (optional) crc and (optional) module
- * which owns it */
-static unsigned long find_symbol(const char *name,
- struct module **owner,
- const unsigned long **crc,
- bool gplok,
- bool warn)
+/* Find a symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it */
+const struct kernel_symbol *find_symbol(const char *name,
+ struct module **owner,
+ const unsigned long **crc,
+ bool gplok,
+ bool warn)
{
struct find_symbol_arg fsa;
@@ -346,15 +346,16 @@ static unsigned long find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
- return fsa.value;
+ return fsa.sym;
}
DEBUGP("Failed to find symbol %s\n", name);
- return -ENOENT;
+ return NULL;
}
+EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
-static struct module *find_module(const char *name)
+struct module *find_module(const char *name)
{
struct module *mod;
@@ -364,8 +365,37 @@ static struct module *find_module(const char *name)
}
return NULL;
}
+EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
+
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
+{
+ void *ptr;
+
+ if (align > PAGE_SIZE) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ ptr = __alloc_reserved_percpu(size, align);
+ if (!ptr)
+ printk(KERN_WARNING
+ "Could not allocate %lu bytes percpu data\n", size);
+ return ptr;
+}
+
+static void percpu_modfree(void *freeme)
+{
+ free_percpu(freeme);
+}
+
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
/* Size of each block. -ve means used. */
@@ -409,6 +439,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
unsigned long extra;
unsigned int i;
void *ptr;
+ int cpu;
if (align > PAGE_SIZE) {
printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -438,6 +469,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
if (!split_block(i, size))
return NULL;
+ /* add the per-cpu scanning areas */
+ for_each_possible_cpu(cpu)
+ kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
+ GFP_KERNEL);
+
/* Mark allocated */
pcpu_size[i] = -pcpu_size[i];
return ptr;
@@ -452,6 +488,7 @@ static void percpu_modfree(void *freeme)
{
unsigned int i;
void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+ int cpu;
/* First entry is core kernel percpu data. */
for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -463,6 +500,10 @@ static void percpu_modfree(void *freeme)
BUG();
free:
+ /* remove the per-cpu scanning areas */
+ for_each_possible_cpu(cpu)
+ kmemleak_free(freeme + per_cpu_offset(cpu));
+
/* Merge with previous? */
if (pcpu_size[i-1] >= 0) {
pcpu_size[i-1] += pcpu_size[i];
@@ -480,21 +521,6 @@ static void percpu_modfree(void *freeme)
}
}
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
-{
- return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-}
-
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memcpy(pcpudest + per_cpu_offset(cpu), from, size);
-}
-
static int percpu_modinit(void)
{
pcpu_num_used = 2;
@@ -513,7 +539,26 @@ static int percpu_modinit(void)
return 0;
}
__initcall(percpu_modinit);
+
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
#else /* ... !CONFIG_SMP */
+
static inline void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
@@ -535,6 +580,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
/* pcpusec should be 0, and size of that section should be 0. */
BUG_ON(size != 0);
}
+
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
@@ -573,13 +619,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -607,7 +653,7 @@ static int already_uses(struct module *a, struct module *b)
}
/* Module a uses b */
-static int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
{
struct module_use *use;
int no_warn, err;
@@ -640,6 +686,7 @@ static int use_module(struct module *a, struct module *b)
no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
return 1;
}
+EXPORT_SYMBOL_GPL(use_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -717,10 +764,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -750,7 +798,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
char name[MODULE_NAME_LEN];
int ret, forced = 0;
- if (!capable(CAP_SYS_MODULE))
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -821,7 +869,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
mutex_lock(&module_mutex);
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
- unregister_dynamic_debug_module(mod->name);
+ ddebug_remove_module(mod->name);
free_module(mod);
out:
@@ -859,22 +907,26 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
- if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
+ if (!find_symbol(symbol, &owner, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);
+/* Note this assumes addr is a function, which it currently always is. */
void symbol_put_addr(void *addr)
{
struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
- if (core_kernel_text((unsigned long)addr))
+ if (core_kernel_text(a))
return;
- if (!(modaddr = module_text_address((unsigned long)addr)))
- BUG();
+ /* module_text_address is safe here: we're supposed to have reference
+ * to module from symbol_get, so it can't go away. */
+ modaddr = __module_text_address(a);
+ BUG_ON(!modaddr);
module_put(modaddr);
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -894,7 +946,9 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
+ trace_module_put(module, _RET_IP_,
+ local_read(__module_ref_addr(module, cpu)));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -914,10 +968,11 @@ static inline void module_unload_free(struct module *mod)
{
}
-static inline int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
{
return strong_try_module_get(b) == 0;
}
+EXPORT_SYMBOL_GPL(use_module);
static inline void module_unload_init(struct module *mod)
{
@@ -960,12 +1015,12 @@ static struct module_attribute *modinfo_attrs[] = {
static const char vermagic[] = VERMAGIC_STRING;
-static int try_to_force_load(struct module *mod, const char *symname)
+static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
- printk("%s: no version for \"%s\" found: kernel tainted.\n",
- mod->name, symname);
+ printk(KERN_WARNING "%s: %s: kernel tainted.\n",
+ mod->name, reason);
add_taint_module(mod, TAINT_FORCED_MODULE);
return 0;
#else
@@ -1022,9 +1077,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
- if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
+ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ &crc, true, false))
BUG();
- return check_version(sechdrs, versindex, "struct_module", mod, crc);
+ return check_version(sechdrs, versindex, "module_layout", mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1063,25 +1119,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
/* Resolve a symbol for this module. I.e. if we find one, record usage.
Must be holding module_mutex. */
-static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
- unsigned int versindex,
- const char *name,
- struct module *mod)
+static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *name,
+ struct module *mod)
{
struct module *owner;
- unsigned long ret;
+ const struct kernel_symbol *sym;
const unsigned long *crc;
- ret = find_symbol(name, &owner, &crc,
+ sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- if (!IS_ERR_VALUE(ret)) {
- /* use_module can fail due to OOM,
- or module initialization or unloading */
+ /* use_module can fail due to OOM,
+ or module initialization or unloading */
+ if (sym) {
if (!check_version(sechdrs, versindex, name, mod, crc) ||
!use_module(mod, owner))
- ret = -EINVAL;
+ sym = NULL;
}
- return ret;
+ return sym;
}
/*
@@ -1225,6 +1281,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
struct module_notes_attrs *notes_attrs;
struct bin_attribute *nattr;
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
/* Count notes sections and allocate structures. */
notes = 0;
for (i = 0; i < nsect; i++)
@@ -1444,6 +1504,8 @@ static int __unlink_module(void *_mod)
/* Free a module, remove from lists, etc (must hold module_mutex). */
static void free_module(struct module *mod)
{
+ trace_module_free(mod);
+
/* Delete from various lists */
stop_machine(__unlink_module, mod, NULL);
remove_notes_attrs(mod);
@@ -1456,15 +1518,18 @@ static void free_module(struct module *mod)
/* Module unload stuff */
module_unload_free(mod);
- /* release any pointers to mcount in this module */
- ftrace_release(mod->module_core, mod->core_size);
+ /* Free any allocated parameters. */
+ destroy_params(mod->kp, mod->num_kp);
/* This may be NULL, but that's OK */
module_free(mod, mod->module_init);
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1475,17 +1540,15 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
struct module *owner;
- unsigned long value;
+ const struct kernel_symbol *sym;
preempt_disable();
- value = find_symbol(symbol, &owner, NULL, true, true);
- if (IS_ERR_VALUE(value))
- value = 0;
- else if (strong_try_module_get(owner))
- value = 0;
+ sym = find_symbol(symbol, &owner, NULL, true, true);
+ if (sym && strong_try_module_get(owner))
+ sym = NULL;
preempt_enable();
- return (void *)value;
+ return sym ? (void *)sym->value : NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -1513,8 +1576,7 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
- NULL, true, false))) {
+ if (find_symbol(s->name, &owner, NULL, true, false)) {
printk(KERN_ERR
"%s: exports duplicate symbol %s"
" (owned by %s)\n",
@@ -1538,6 +1600,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
unsigned long secbase;
unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
int ret = 0;
+ const struct kernel_symbol *ksym;
for (i = 1; i < n; i++) {
switch (sym[i].st_shndx) {
@@ -1557,13 +1620,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
- sym[i].st_value
- = resolve_symbol(sechdrs, versindex,
- strtab + sym[i].st_name, mod);
-
+ ksym = resolve_symbol(sechdrs, versindex,
+ strtab + sym[i].st_name, mod);
/* Ok if resolved. */
- if (!IS_ERR_VALUE(sym[i].st_value))
+ if (ksym) {
+ sym[i].st_value = ksym->value;
break;
+ }
+
/* Ok if weak. */
if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
@@ -1638,8 +1702,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || strncmp(secstrings + s->sh_name,
- ".init", 5) == 0)
+ || strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1656,8 +1719,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || strncmp(secstrings + s->sh_name,
- ".init", 5) != 0)
+ || !strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
@@ -1790,8 +1852,7 @@ static char elf_type(const Elf_Sym *sym,
else
return 'b';
}
- if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug", strlen(".debug")) == 0)
+ if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
return 'n';
return '?';
}
@@ -1823,19 +1884,13 @@ static inline void add_kallsyms(struct module *mod,
}
#endif /* CONFIG_KALLSYMS */
-static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
- unsigned int i;
-
- for (i = 0; i < num; i++) {
- register_dynamic_debug_module(debug[i].modname,
- debug[i].type,
- debug[i].logical_modname,
- debug[i].flag_names,
- debug[i].hash, debug[i].hash2);
- }
-#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+#ifdef CONFIG_DYNAMIC_DEBUG
+ if (ddebug_add_module(debug, num, debug->modname))
+ printk(KERN_ERR "dynamic debug error adding module: %s\n",
+ debug->modname);
+#endif
}
static void *module_alloc_update_bounds(unsigned long size)
@@ -1852,6 +1907,36 @@ static void *module_alloc_update_bounds(unsigned long size)
return ret;
}
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs, char *secstrings)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod->module_core, (unsigned long)mod -
+ (unsigned long)mod->module_core,
+ sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+ if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
+ && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+ continue;
+
+ kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
+ (unsigned long)mod->module_core,
+ sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
+#else
+static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs, char *secstrings)
+{
+}
+#endif
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static noinline struct module *load_module(void __user *umod,
@@ -1866,12 +1951,9 @@ static noinline struct module *load_module(void __user *umod,
unsigned int symindex = 0;
unsigned int strindex = 0;
unsigned int modindex, versindex, infoindex, pcpuindex;
- unsigned int num_kp, num_mcount;
- struct kernel_param *kp;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
- unsigned long *mseg;
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1884,12 +1966,6 @@ static noinline struct module *load_module(void __user *umod,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return ERR_PTR(-ENOMEM);
- /* Create stop_machine threads since the error path relies on
- * a non-failing stop_machine call. */
- err = stop_machine_create();
- if (err)
- goto free_hdr;
-
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@@ -1930,7 +2006,7 @@ static noinline struct module *load_module(void __user *umod,
}
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
- if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+ if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
@@ -1974,7 +2050,7 @@ static noinline struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
- err = try_to_force_load(mod, "magic");
+ err = try_to_force_load(mod, "bad vermagic");
if (err)
goto free_hdr;
} else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2031,6 +2107,12 @@ static noinline struct module *load_module(void __user *umod,
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. Just mark it as not being a
+ * leak.
+ */
+ kmemleak_not_leak(ptr);
if (!ptr) {
err = -ENOMEM;
goto free_percpu;
@@ -2039,6 +2121,13 @@ static noinline struct module *load_module(void __user *umod,
mod->module_core = ptr;
ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
if (!ptr && mod->init_size) {
err = -ENOMEM;
goto free_core;
@@ -2069,7 +2158,16 @@ static noinline struct module *load_module(void __user *umod,
}
/* Module has been moved. */
mod = (void *)sechdrs[modindex].sh_addr;
+ kmemleak_load_module(mod, hdr, sechdrs, secstrings);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_init;
+ }
+#endif
/* Now we've moved module, initialize linked lists, etc. */
module_unload_init(mod);
@@ -2104,8 +2202,8 @@ static noinline struct module *load_module(void __user *umod,
/* Now we've got everything in the final locations, we can
* find optional sections. */
- kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
- &num_kp);
+ mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
sizeof(*mod->syms), &mod->num_syms);
mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2134,6 +2232,10 @@ static noinline struct module *load_module(void __user *umod,
mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
"__kcrctab_unused_gpl");
#endif
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+#endif
#ifdef CONFIG_MARKERS
mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2145,7 +2247,19 @@ static noinline struct module *load_module(void __user *umod,
sizeof(*mod->tracepoints),
&mod->num_tracepoints);
#endif
-
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+ "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+ "__mcount_loc",
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2155,8 +2269,8 @@ static noinline struct module *load_module(void __user *umod,
|| (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
- printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
- err = try_to_force_load(mod, "nocrc");
+ err = try_to_force_load(mod,
+ "no versions for exported symbols");
if (err)
goto cleanup;
}
@@ -2201,19 +2315,15 @@ static noinline struct module *load_module(void __user *umod,
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
if (!mod->taints) {
- struct mod_debug *debug;
+ struct _ddebug *debug;
unsigned int num_debug;
debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
sizeof(*debug), &num_debug);
- dynamic_printk_setup(debug, num_debug);
+ if (debug)
+ dynamic_debug_setup(debug, num_debug);
}
- /* sechdrs[0].sh_size is always zero */
- mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
- sizeof(*mseg), &num_mcount);
- ftrace_init_module(mod, mseg, mseg + num_mcount);
-
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
@@ -2250,11 +2360,11 @@ static noinline struct module *load_module(void __user *umod,
*/
list_add_rcu(&mod->list, &modules);
- err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
+ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
goto unlink;
- err = mod_sysfs_setup(mod, kp, num_kp);
+ err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2263,22 +2373,29 @@ static noinline struct module *load_module(void __user *umod,
/* Get rid of temporary copy */
vfree(hdr);
- stop_machine_destroy();
+ trace_module_load(mod);
+
/* Done! */
return mod;
unlink:
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ synchronize_sched();
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
- ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ free_init:
+ percpu_modfree(mod->refptr);
+#endif
module_free(mod, mod->module_init);
free_core:
module_free(mod, mod->module_core);
+ /* mod will be freed with core. Don't access it beyond this line! */
free_percpu:
if (percpu)
percpu_modfree(percpu);
@@ -2286,7 +2403,6 @@ static noinline struct module *load_module(void __user *umod,
kfree(args);
free_hdr:
vfree(hdr);
- stop_machine_destroy();
return ERR_PTR(err);
truncated:
@@ -2295,6 +2411,17 @@ static noinline struct module *load_module(void __user *umod,
goto free_hdr;
}
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
/* This is where the real work happens */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
@@ -2303,7 +2430,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
int ret = 0;
/* Must have permission */
- if (!capable(CAP_SYS_MODULE))
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
return -EPERM;
/* Only one module load at a time, please */
@@ -2323,6 +2450,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
+ do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
@@ -2341,9 +2469,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
return ret;
}
if (ret > 0) {
- printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
- "it should follow 0/-E convention\n"
- KERN_WARNING "%s: loading module anyway...\n",
+ printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
__func__, mod->name, ret,
__func__);
dump_stack();
@@ -2355,9 +2483,13 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
+ /* We need to finish all async code before the module init sequence is done */
+ async_synchronize_full();
+
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
+ trim_init_extable(mod);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
@@ -2563,6 +2695,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
preempt_enable();
return ret;
}
+
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ struct module *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret;
+
+ list_for_each_entry(mod, &modules, list) {
+ for (i = 0; i < mod->num_symtab; i++) {
+ ret = fn(data, mod->strtab + mod->symtab[i].st_name,
+ mod, mod->symtab[i].st_value);
+ if (ret != 0)
+ return ret;
+ }
+ }
+ return 0;
+}
#endif /* CONFIG_KALLSYMS */
static char *module_flags(struct module *mod, char *buf)
@@ -2698,29 +2849,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
}
/*
- * Is this a valid module address?
+ * is_module_address - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
*/
-int is_module_address(unsigned long addr)
+bool is_module_address(unsigned long addr)
{
- struct module *mod;
+ bool ret;
preempt_disable();
-
- list_for_each_entry_rcu(mod, &modules, list) {
- if (within_module_core(addr, mod)) {
- preempt_enable();
- return 1;
- }
- }
-
+ ret = __module_address(addr) != NULL;
preempt_enable();
- return 0;
+ return ret;
}
-
-/* Is this a valid kernel address? */
-__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
+/*
+ * __module_address - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
{
struct module *mod;
@@ -2728,22 +2881,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
return NULL;
list_for_each_entry_rcu(mod, &modules, list)
- if (within(addr, mod->module_init, mod->init_text_size)
- || within(addr, mod->module_core, mod->core_text_size))
+ if (within_module_core(addr, mod)
+ || within_module_init(addr, mod))
return mod;
return NULL;
}
+EXPORT_SYMBOL_GPL(__module_address);
-struct module *module_text_address(unsigned long addr)
+/*
+ * is_module_text_address - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module. See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
{
- struct module *mod;
+ bool ret;
preempt_disable();
- mod = __module_text_address(addr);
+ ret = __module_text_address(addr) != NULL;
preempt_enable();
+ return ret;
+}
+
+/*
+ * __module_text_address - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod = __module_address(addr);
+ if (mod) {
+ /* Make sure it's within the text section. */
+ if (!within(addr, mod->module_init, mod->init_text_size)
+ && !within(addr, mod->module_core, mod->core_text_size))
+ mod = NULL;
+ }
return mod;
}
+EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
@@ -2751,7 +2933,7 @@ void print_modules(void)
struct module *mod;
char buf[8];
- printk("Modules linked in:");
+ printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list)
@@ -2763,9 +2945,17 @@ void print_modules(void)
}
#ifdef CONFIG_MODVERSIONS
-/* Generate the signature for struct module here, too, for modversions. */
-void struct_module(struct module *mod) { return; }
-EXPORT_SYMBOL(struct_module);
+/* Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module. */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct marker *marker,
+ struct tracepoint *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
#endif
#ifdef CONFIG_MARKERS
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb53..50d022e5a56 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
/*
* Must be called with lock->wait_lock held.
*/
-void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
-{
- lock->owner = new_owner;
-}
-
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
- waiter->lock = lock;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ mutex_clear_owner(lock);
}
void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
- lock->owner = NULL;
lock->magic = lock;
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534..6b2d735846a 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
/*
* This must be called with lock->wait_lock held.
*/
-extern void
-debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
-
-static inline void debug_mutex_clear_owner(struct mutex *lock)
-{
- lock->owner = NULL;
-}
-
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658e..947b3ad551f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
* Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
* David Howells for suggestions and improvements.
*
+ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ * from the -rt tree, where it was originally implemented for rtmutexes
+ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ * and Sven Dietrich.
+ *
* Also see Documentation/mutex-design.txt.
*/
#include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
+ mutex_clear_owner(lock);
debug_mutex_init(lock, name, key);
}
@@ -83,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) down().
*/
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ mutex_set_owner(lock);
}
EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
* The unlocking fastpath is the 0->1 transition from 'locked'
* into 'unlocked' state:
*/
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(lock);
+#endif
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
}
@@ -129,21 +144,76 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
{
struct task_struct *task = current;
struct mutex_waiter waiter;
- unsigned int old_val;
unsigned long flags;
+ preempt_disable();
+ mutex_acquire(&lock->dep_map, subclass, 0, ip);
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
+ !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
+ /*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that there are no
+ * pending waiters and the lock owner is currently running on a
+ * (different) CPU.
+ *
+ * The rationale is that if the lock owner is running, it is likely to
+ * release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ */
+
+ for (;;) {
+ struct thread_info *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ lock_acquired(&lock->dep_map, ip);
+ mutex_set_owner(lock);
+ preempt_enable();
+ return 0;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+#endif
spin_lock_mutex(&lock->wait_lock, flags);
debug_mutex_lock_common(lock, &waiter);
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- old_val = atomic_xchg(&lock->count, -1);
- if (old_val == 1)
+ if (atomic_xchg(&lock->count, -1) == 1)
goto done;
lock_contended(&lock->dep_map, ip);
@@ -158,8 +228,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- old_val = atomic_xchg(&lock->count, -1);
- if (old_val == 1)
+ if (atomic_xchg(&lock->count, -1) == 1)
break;
/*
@@ -173,21 +242,24 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
+ preempt_enable();
return -EINTR;
}
__set_task_state(task, state);
/* didnt get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
+ preempt_enable_no_resched();
schedule();
+ preempt_disable();
spin_lock_mutex(&lock->wait_lock, flags);
}
done:
lock_acquired(&lock->dep_map, ip);
/* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
- debug_mutex_set_owner(lock, task_thread_info(task));
+ mutex_remove_waiter(lock, &waiter, current_thread_info());
+ mutex_set_owner(lock);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -196,6 +268,7 @@ done:
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
+ preempt_enable();
return 0;
}
@@ -222,7 +295,8 @@ int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+ subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +334,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
wake_up_process(waiter->task);
}
- debug_mutex_clear_owner(lock);
-
spin_unlock_mutex(&lock->wait_lock, flags);
}
@@ -298,18 +370,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
+ if (!ret)
+ mutex_set_owner(lock);
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
int __sched mutex_lock_killable(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_killable_slowpath);
+ if (!ret)
+ mutex_set_owner(lock);
+
+ return ret;
}
EXPORT_SYMBOL(mutex_lock_killable);
@@ -352,9 +436,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
prev = atomic_xchg(&lock->count, -1);
if (likely(prev == 1)) {
- debug_mutex_set_owner(lock, current_thread_info());
+ mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
}
+
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -380,8 +465,36 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
*/
int __sched mutex_trylock(struct mutex *lock)
{
- return __mutex_fastpath_trylock(&lock->count,
- __mutex_trylock_slowpath);
-}
+ int ret;
+
+ ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+ if (ret)
+ mutex_set_owner(lock);
+ return ret;
+}
EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb29..67578ca48f9 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
-#define debug_mutex_clear_owner(lock) do { } while (0)
+#ifdef CONFIG_SMP
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current_thread_info();
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+#else
+static inline void mutex_set_owner(struct mutex *lock)
+{
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+}
+#endif
+
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d..5aa854f9e5a 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
/*
* Rules:
- * 1. you can only enter a cgroup which is a child of your current
+ * 1. you can only enter a cgroup which is a descendant of your current
* cgroup
* 2. you can only place another process into a cgroup if
* a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
static int ns_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup, struct task_struct *task)
{
- struct cgroup *orig;
-
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!cgroup_is_descendant(new_cgroup))
+ if (!cgroup_is_descendant(new_cgroup, current))
return -EPERM;
}
- if (atomic_read(&new_cgroup->count) != 0)
- return -EPERM;
-
- orig = task_cgroup(task, ns_subsys_id);
- if (orig && orig != new_cgroup->parent)
+ if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (!cgroup_is_descendant(cgroup))
+ if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0..09b4ff9711b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
-/*
- * creates a copy of "orig" with refcount 1.
- */
-static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
+static inline struct nsproxy *create_nsproxy(void)
{
- struct nsproxy *ns;
+ struct nsproxy *nsproxy;
- ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
- if (ns) {
- memcpy(ns, orig, sizeof(struct nsproxy));
- atomic_set(&ns->count, 1);
- }
- return ns;
+ nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+ if (nsproxy)
+ atomic_set(&nsproxy->count, 1);
+ return nsproxy;
}
/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
struct nsproxy *new_nsp;
int err;
- new_nsp = clone_nsproxy(tsk->nsproxy);
+ new_nsp = create_nsproxy();
if (!new_nsp)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44..512ab73b0ca 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
+#include <linux/debug_locks.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/random.h>
#include <linux/reboot.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/sysrq.h>
-#include <linux/interrupt.h>
+#include <linux/init.h>
#include <linux/nmi.h>
-#include <linux/kexec.h>
-#include <linux/debug_locks.h>
-#include <linux/random.h>
-#include <linux/kallsyms.h>
#include <linux/dmi.h>
int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
-
NORET_TYPE void panic(const char * fmt, ...)
{
- long i;
static char buf[1024];
va_list args;
-#if defined(CONFIG_S390)
- unsigned long caller = (unsigned long) __builtin_return_address(0);
-#endif
+ long i;
/*
- * It's possible to come here directly from a panic-assertion and not
- * have preempt disabled. Some functions called from here want
+ * It's possible to come here directly from a panic-assertion and
+ * not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*/
preempt_disable();
@@ -74,7 +70,9 @@ NORET_TYPE void panic(const char * fmt, ...)
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
- bust_spinlocks(0);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ dump_stack();
+#endif
/*
* If we have crashed and we have a crash kernel loaded let it handle
@@ -83,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
crash_kexec(NULL);
-#ifdef CONFIG_SMP
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
smp_send_stop();
-#endif
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
@@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
if (panic_timeout > 0) {
/*
- * Delay timeout seconds before rebooting the machine.
- * We can't use the "normal" timers since we just panicked..
- */
- printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked.
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
- /* This will not be a clean reboot, with everything
- * shutting down. But if there is a chance of
- * rebooting the system it will be rebooted.
+ /*
+ * This will not be a clean reboot, with everything
+ * shutting down. But if there is a chance of
+ * rebooting the system it will be rebooted.
*/
emergency_restart();
}
@@ -124,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
}
#endif
#if defined(CONFIG_S390)
- disabled_wait(caller);
+ {
+ unsigned long caller;
+
+ caller = (unsigned long)__builtin_return_address(0);
+ disabled_wait(caller);
+ }
#endif
local_irq_enable();
- for (i = 0;;) {
+ for (i = 0; ; ) {
touch_softlockup_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
+ bust_spinlocks(0);
}
EXPORT_SYMBOL(panic);
struct tnt {
- u8 bit;
- char true;
- char false;
+ u8 bit;
+ char true;
+ char false;
};
static const struct tnt tnts[] = {
- { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
- { TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_UNSAFE_SMP, 'S', ' ' },
- { TAINT_FORCED_RMMOD, 'R', ' ' },
- { TAINT_MACHINE_CHECK, 'M', ' ' },
- { TAINT_BAD_PAGE, 'B', ' ' },
- { TAINT_USER, 'U', ' ' },
- { TAINT_DIE, 'D', ' ' },
- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
- { TAINT_WARN, 'W', ' ' },
- { TAINT_CRAP, 'C', ' ' },
+ { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
+ { TAINT_FORCED_MODULE, 'F', ' ' },
+ { TAINT_UNSAFE_SMP, 'S', ' ' },
+ { TAINT_FORCED_RMMOD, 'R', ' ' },
+ { TAINT_MACHINE_CHECK, 'M', ' ' },
+ { TAINT_BAD_PAGE, 'B', ' ' },
+ { TAINT_USER, 'U', ' ' },
+ { TAINT_DIE, 'D', ' ' },
+ { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
+ { TAINT_WARN, 'W', ' ' },
+ { TAINT_CRAP, 'C', ' ' },
};
/**
@@ -192,7 +196,8 @@ const char *print_tainted(void)
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
- return(buf);
+
+ return buf;
}
int test_taint(unsigned flag)
@@ -208,7 +213,16 @@ unsigned long get_taint(void)
void add_taint(unsigned flag)
{
- debug_locks = 0; /* can't trust the integrity of the kernel anymore */
+ /*
+ * Can't trust the integrity of the kernel anymore.
+ * We don't call directly debug_locks_off() because the issue
+ * is not necessarily serious enough to set oops_in_progress to 1
+ * Also we want to keep up lockdep for staging development and
+ * post-warning case.
+ */
+ if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
+ printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
@@ -263,8 +277,8 @@ static void do_oops_enter_exit(void)
}
/*
- * Return true if the calling CPU is allowed to print oops-related info. This
- * is a bit racy..
+ * Return true if the calling CPU is allowed to print oops-related info.
+ * This is a bit racy..
*/
int oops_may_print(void)
{
@@ -273,20 +287,23 @@ int oops_may_print(void)
/*
* Called when the architecture enters its oops handler, before it prints
- * anything. If this is the first CPU to oops, and it's oopsing the first time
- * then let it proceed.
+ * anything. If this is the first CPU to oops, and it's oopsing the first
+ * time then let it proceed.
*
- * This is all enabled by the pause_on_oops kernel boot option. We do all this
- * to ensure that oopses don't scroll off the screen. It has the side-effect
- * of preventing later-oopsing CPUs from mucking up the display, too.
+ * This is all enabled by the pause_on_oops kernel boot option. We do all
+ * this to ensure that oopses don't scroll off the screen. It has the
+ * side-effect of preventing later-oopsing CPUs from mucking up the display,
+ * too.
*
- * It turns out that the CPU which is allowed to print ends up pausing for the
- * right duration, whereas all the other CPUs pause for twice as long: once in
- * oops_enter(), once in oops_exit().
+ * It turns out that the CPU which is allowed to print ends up pausing for
+ * the right duration, whereas all the other CPUs pause for twice as long:
+ * once in oops_enter(), once in oops_exit().
*/
void oops_enter(void)
{
- debug_locks_off(); /* can't trust the integrity of the kernel anymore */
+ tracing_off();
+ /* can't trust the integrity of the kernel anymore: */
+ debug_locks_off();
do_oops_enter_exit();
}
@@ -324,46 +341,61 @@ void oops_exit(void)
}
#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath(const char *file, int line, const char *fmt, ...)
-{
+struct slowpath_args {
+ const char *fmt;
va_list args;
- char function[KSYM_SYMBOL_LEN];
- unsigned long caller = (unsigned long)__builtin_return_address(0);
- const char *board;
+};
- sprint_symbol(function, caller);
+static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
+{
+ const char *board;
printk(KERN_WARNING "------------[ cut here ]------------\n");
- printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
- line, function);
+ printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
board = dmi_get_system_info(DMI_PRODUCT_NAME);
if (board)
printk(KERN_WARNING "Hardware name: %s\n", board);
- if (fmt) {
- va_start(args, fmt);
- vprintk(fmt, args);
- va_end(args);
- }
+ if (args)
+ vprintk(args->fmt, args->args);
print_modules();
dump_stack();
print_oops_end_marker();
add_taint(TAINT_WARN);
}
-EXPORT_SYMBOL(warn_slowpath);
+
+void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+{
+ struct slowpath_args args;
+
+ args.fmt = fmt;
+ va_start(args.args, fmt);
+ warn_slowpath_common(file, line, __builtin_return_address(0), &args);
+ va_end(args.args);
+}
+EXPORT_SYMBOL(warn_slowpath_fmt);
+
+void warn_slowpath_null(const char *file, int line)
+{
+ warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
+}
+EXPORT_SYMBOL(warn_slowpath_null);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
+
/*
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
void __stack_chk_fail(void)
{
- panic("stack-protector: Kernel stack is corrupted");
+ panic("stack-protector: Kernel stack is corrupted in: %p\n",
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(__stack_chk_fail);
+
#endif
core_param(panic, panic_timeout, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a..7f6912ced2b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -217,7 +217,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
return -ENOSPC;
}
- *(char **)kp->arg = (char *)val;
+ if (kp->flags & KPARAM_KMALLOCED)
+ kfree(*(char **)kp->arg);
+
+ /* This is a hack. We can't need to strdup in early boot, and we
+ * don't need to; this mangled commandline is preserved. */
+ if (slab_is_available()) {
+ kp->flags |= KPARAM_KMALLOCED;
+ *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
+ if (!kp->arg)
+ return -ENOMEM;
+ } else
+ *(const char **)kp->arg = val;
+
return 0;
}
@@ -226,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
return sprintf(buffer, "%s", *((char **)kp->arg));
}
+/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, struct kernel_param *kp)
{
+ bool v;
+
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
switch (val[0]) {
case 'y': case 'Y': case '1':
- *(int *)kp->arg = 1;
- return 0;
+ v = true;
+ break;
case 'n': case 'N': case '0':
- *(int *)kp->arg = 0;
- return 0;
+ v = false;
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+
+ if (kp->flags & KPARAM_ISBOOL)
+ *(bool *)kp->arg = v;
+ else
+ *(int *)kp->arg = v;
+ return 0;
}
int param_get_bool(char *buffer, struct kernel_param *kp)
{
+ bool val;
+ if (kp->flags & KPARAM_ISBOOL)
+ val = *(bool *)kp->arg;
+ else
+ val = *(int *)kp->arg;
+
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N');
+ return sprintf(buffer, "%c", val ? 'Y' : 'N');
}
+/* This one must be bool. */
int param_set_invbool(const char *val, struct kernel_param *kp)
{
- int boolval, ret;
+ int ret;
+ bool boolval;
struct kernel_param dummy;
dummy.arg = &boolval;
+ dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
- *(int *)kp->arg = !boolval;
+ *(bool *)kp->arg = !boolval;
return ret;
}
int param_get_invbool(char *buffer, struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
}
/* We break the rule and mangle the string. */
@@ -571,6 +602,15 @@ void module_param_sysfs_remove(struct module *mod)
}
#endif
+void destroy_params(const struct kernel_param *params, unsigned num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++)
+ if (params[i].flags & KPARAM_KMALLOCED)
+ kfree(*(char **)params[i].arg);
+}
+
static void __init kernel_add_sysfs_param(const char *name,
struct kernel_param *kparam,
unsigned int name_skip)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 00000000000..e0d91fdf0c3
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4962 @@
+/*
+ * Performance counter core code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/perf_counter.h>
+
+#include <asm/irq_regs.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly = 1;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+static atomic_t nr_counters __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
+
+/*
+ * perf counter paranoia level:
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu counters for unpriv
+ * 2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_counter_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+ return sysctl_perf_counter_paranoid > -1;
+}
+
+static inline bool perf_paranoid_cpu(void)
+{
+ return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+ return sysctl_perf_counter_paranoid > 1;
+}
+
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
+
+static atomic64_t perf_counter_id;
+
+/*
+ * Lock for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_SPINLOCK(perf_resource_lock);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ return NULL;
+}
+
+void __weak hw_perf_disable(void) { barrier(); }
+void __weak hw_perf_enable(void) { barrier(); }
+
+void __weak hw_perf_counter_setup(int cpu) { barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx, int cpu)
+{
+ return 0;
+}
+
+void __weak perf_counter_print_debug(void) { }
+
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+ __get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+ return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+ __perf_disable();
+ hw_perf_disable();
+}
+
+void perf_enable(void)
+{
+ if (__perf_enable())
+ hw_perf_enable();
+}
+
+static void get_ctx(struct perf_counter_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_counter_context *ctx;
+
+ ctx = container_of(head, struct perf_counter_context, rcu_head);
+ kfree(ctx);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ call_rcu(&ctx->rcu_head, free_ctx);
+ }
+}
+
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+ if (ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+}
+
+/*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+static u64 primary_counter_id(struct perf_counter *counter)
+{
+ u64 id = counter->id;
+
+ if (counter->parent)
+ id = counter->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+{
+ struct perf_counter_context *ctx;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ put_ctx(ctx);
+}
+
+/*
+ * Add a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *group_leader = counter->group_leader;
+
+ /*
+ * Depending on whether it is a standalone or sibling counter,
+ * add it straight to the context's counter list, or to the group
+ * leader's sibling list:
+ */
+ if (group_leader == counter)
+ list_add_tail(&counter->list_entry, &ctx->counter_list);
+ else {
+ list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+ }
+
+ list_add_rcu(&counter->event_entry, &ctx->event_list);
+ ctx->nr_counters++;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat++;
+}
+
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+ struct perf_counter *sibling, *tmp;
+
+ if (list_empty(&counter->list_entry))
+ return;
+ ctx->nr_counters--;
+ if (counter->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_init(&counter->list_entry);
+ list_del_rcu(&counter->event_entry);
+
+ if (counter->group_leader != counter)
+ counter->group_leader->nr_siblings--;
+
+ /*
+ * If this was a group counter with sibling counters then
+ * upgrade the siblings to singleton counters by adding them
+ * to the context list directly:
+ */
+ list_for_each_entry_safe(sibling, tmp,
+ &counter->sibling_list, list_entry) {
+
+ list_move_tail(&sibling->list_entry, &ctx->counter_list);
+ sibling->group_leader = sibling;
+ }
+}
+
+static void
+counter_sched_out(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+ counter->tstamp_stopped = ctx->time;
+ counter->pmu->disable(counter);
+ counter->oncpu = -1;
+
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (counter->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+ counter_sched_out(counter, cpuctx, ctx);
+
+ if (group_counter->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_counter_remove_from_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Protect the list operation against NMI by disabling the
+ * counters on a global level.
+ */
+ perf_disable();
+
+ counter_sched_out(counter, cpuctx, ctx);
+
+ list_del_counter(counter, ctx);
+
+ if (!ctx->task) {
+ /*
+ * Allow more per task counters with respect to the
+ * reservation:
+ */
+ cpuctx->max_pertask =
+ min(perf_max_counters - ctx->nr_counters,
+ perf_max_counters - perf_reserved_percpu);
+ }
+
+ perf_enable();
+ spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with ctx->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_counter_remove_from_context(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu counters are removed via an smp call and
+ * the removal is always sucessful.
+ */
+ smp_call_function_single(counter->cpu,
+ __perf_counter_remove_from_context,
+ counter, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_counter_remove_from_context,
+ counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the context is active we need to retry the smp call.
+ */
+ if (ctx->nr_active && !list_empty(&counter->list_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can remove the counter safely, if the call above did not
+ * succeed.
+ */
+ if (!list_empty(&counter->list_entry)) {
+ list_del_counter(counter, ctx);
+ }
+ spin_unlock_irq(&ctx->lock);
+}
+
+static inline u64 perf_clock(void)
+{
+ return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ u64 run_end;
+
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+ counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
+ return;
+
+ counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ run_end = counter->tstamp_stopped;
+ else
+ run_end = ctx->time;
+
+ counter->total_time_running = run_end - counter->tstamp_running;
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ update_counter_times(leader);
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ update_counter_times(counter);
+}
+
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+
+ /*
+ * If this is a per-task counter, need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ spin_lock(&ctx->lock);
+
+ /*
+ * If the counter is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_group_times(counter);
+ if (counter == counter->group_leader)
+ group_sched_out(counter, cpuctx, ctx);
+ else
+ counter_sched_out(counter, cpuctx, ctx);
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_disable,
+ counter, 1);
+ return;
+ }
+
+ retry:
+ task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the counter is still active, we need to retry the cross-call.
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_group_times(counter);
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int
+counter_sched_in(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ if (counter->state <= PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
+ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (counter->pmu->enable(counter)) {
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ counter->tstamp_running += ctx->time - counter->tstamp_stopped;
+
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ if (counter->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ return 0;
+}
+
+static int
+group_sched_in(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ struct perf_counter *counter, *partial_group;
+ int ret;
+
+ if (group_counter->state == PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+ return -EAGAIN;
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+ partial_group = counter;
+ goto group_error;
+ }
+ }
+
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ if (counter == partial_group)
+ break;
+ counter_sched_out(counter, cpuctx, ctx);
+ }
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ if (!is_software_counter(leader))
+ return 0;
+
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ if (!is_software_counter(counter))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software counters can always go on.
+ */
+ if (is_software_only_group(counter))
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * counters can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * counters on the CPU, it can't go on.
+ */
+ if (counter->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_counter_to_ctx(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{
+ list_add_counter(counter, ctx);
+ counter->tstamp_enabled = ctx->time;
+ counter->tstamp_running = ctx->time;
+ counter->tstamp_stopped = ctx->time;
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
+ */
+static void __perf_install_in_context(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
+ int cpu = smp_processor_id();
+ int err;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived.
+ * Or possibly this is the right context but it isn't
+ * on this cpu because it had no counters.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ /*
+ * Protect the list operation against NMI by disabling the
+ * counters on a global level. NOP for non NMI based counters.
+ */
+ perf_disable();
+
+ add_counter_to_ctx(counter, ctx);
+
+ /*
+ * Don't put the counter on if it is disabled or if
+ * it is in a group and the group isn't on.
+ */
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+ (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+ goto unlock;
+
+ /*
+ * An exclusive counter can't go on if there are already active
+ * hardware counters, and no hardware counter can go on if there
+ * is already an exclusive counter on.
+ */
+ if (!group_can_go_on(counter, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+ if (err) {
+ /*
+ * This counter couldn't go on. If it is in a group
+ * then we have to pull the whole group off.
+ * If the counter group is pinned then put it in error state.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ if (!err && !ctx->task && cpuctx->max_pertask)
+ cpuctx->max_pertask--;
+
+ unlock:
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+ struct perf_counter *counter,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Per cpu counters are installed via an smp call and
+ * the install is always sucessful.
+ */
+ smp_call_function_single(cpu, __perf_install_in_context,
+ counter, 1);
+ return;
+ }
+
+retry:
+ task_oncpu_function_call(task, __perf_install_in_context,
+ counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * we need to retry the smp call.
+ */
+ if (ctx->is_active && list_empty(&counter->list_entry)) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * The lock prevents that this context is scheduled in so we
+ * can add the counter safely, if it the call above did not
+ * succeed.
+ */
+ if (list_empty(&counter->list_entry))
+ add_counter_to_ctx(counter, ctx);
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *sub;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+ list_for_each_entry(sub, &counter->sibling_list, list_entry)
+ if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+ sub->tstamp_enabled =
+ ctx->time - sub->total_time_enabled;
+}
+
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
+{
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
+ int err;
+
+ /*
+ * If this is a per-task counter, need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx) {
+ if (cpuctx->task_ctx || ctx->task != current)
+ return;
+ cpuctx->task_ctx = ctx;
+ }
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ update_context_time(ctx);
+
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto unlock;
+ __perf_counter_mark_enabled(counter, ctx);
+
+ /*
+ * If the counter is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(counter, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ perf_disable();
+ if (counter == leader)
+ err = group_sched_in(counter, cpuctx, ctx,
+ smp_processor_id());
+ else
+ err = counter_sched_in(counter, cpuctx, ctx,
+ smp_processor_id());
+ perf_enable();
+ }
+
+ if (err) {
+ /*
+ * If this counter can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ unlock:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_enable,
+ counter, 1);
+ return;
+ }
+
+ spin_lock_irq(&ctx->lock);
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the counter is in error state, clear that first.
+ * That way, if we see the counter in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+ spin_unlock_irq(&ctx->lock);
+ task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+ spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the counter is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+ goto retry;
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ __perf_counter_mark_enabled(counter, ctx);
+
+ out:
+ spin_unlock_irq(&ctx->lock);
+}
+
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+ /*
+ * not supported on inherited counters
+ */
+ if (counter->attr.inherit)
+ return -EINVAL;
+
+ atomic_add(refresh, &counter->event_limit);
+ perf_counter_enable(counter);
+
+ return 0;
+}
+
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_counter *counter;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 0;
+ if (likely(!ctx->nr_counters))
+ goto out;
+ update_context_time(ctx);
+
+ perf_disable();
+ if (ctx->nr_active) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter != counter->group_leader)
+ counter_sched_out(counter, cpuctx, ctx);
+ else
+ group_sched_out(counter, cpuctx, ctx);
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+ struct perf_counter_context *ctx2)
+{
+ return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+ && ctx1->parent_gen == ctx2->parent_gen
+ && !ctx1->pin_count && !ctx2->pin_count;
+}
+
+static void __perf_counter_read(void *counter);
+
+static void __perf_counter_sync_stat(struct perf_counter *counter,
+ struct perf_counter *next_counter)
+{
+ u64 value;
+
+ if (!counter->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the counter value, we cannot use perf_counter_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the counter must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (counter->state) {
+ case PERF_COUNTER_STATE_ACTIVE:
+ __perf_counter_read(counter);
+ break;
+
+ case PERF_COUNTER_STATE_INACTIVE:
+ update_counter_times(counter);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the counter
+ * values when we flip the contexts.
+ */
+ value = atomic64_read(&next_counter->count);
+ value = atomic64_xchg(&counter->count, value);
+ atomic64_set(&next_counter->count, value);
+
+ swap(counter->total_time_enabled, next_counter->total_time_enabled);
+ swap(counter->total_time_running, next_counter->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_counter_update_userpage(counter);
+ perf_counter_update_userpage(next_counter);
+}
+
+#define list_next_entry(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+static void perf_counter_sync_stat(struct perf_counter_context *ctx,
+ struct perf_counter_context *next_ctx)
+{
+ struct perf_counter *counter, *next_counter;
+
+ if (!ctx->nr_stat)
+ return;
+
+ counter = list_first_entry(&ctx->event_list,
+ struct perf_counter, event_entry);
+
+ next_counter = list_first_entry(&next_ctx->event_list,
+ struct perf_counter, event_entry);
+
+ while (&counter->event_entry != &ctx->event_list &&
+ &next_counter->event_entry != &next_ctx->event_list) {
+
+ __perf_counter_sync_stat(counter, next_counter);
+
+ counter = list_next_entry(counter, event_entry);
+ next_counter = list_next_entry(next_counter, event_entry);
+ }
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task,
+ struct task_struct *next, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+ struct perf_counter_context *next_ctx;
+ struct perf_counter_context *parent;
+ struct pt_regs *regs;
+ int do_switch = 1;
+
+ regs = task_pt_regs(task);
+ perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+ if (likely(!ctx || !cpuctx->task_ctx))
+ return;
+
+ update_context_time(ctx);
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = next->perf_counter_ctxp;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ spin_lock(&ctx->lock);
+ spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_counter_ctxp
+ */
+ task->perf_counter_ctxp = next_ctx;
+ next->perf_counter_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+
+ perf_counter_sync_stat(ctx, next_ctx);
+ }
+ spin_unlock(&next_ctx->lock);
+ spin_unlock(&ctx->lock);
+ }
+ rcu_read_unlock();
+
+ if (do_switch) {
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+ }
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+ __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
+}
+
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter *counter;
+ int can_add_hw = 1;
+
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
+ if (likely(!ctx->nr_counters))
+ goto out;
+
+ ctx->timestamp = perf_clock();
+
+ perf_disable();
+
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ !counter->attr.pinned)
+ continue;
+ if (counter->cpu != -1 && counter->cpu != cpu)
+ continue;
+
+ if (counter != counter->group_leader)
+ counter_sched_in(counter, cpuctx, ctx, cpu);
+ else {
+ if (group_can_go_on(counter, cpuctx, 1))
+ group_sched_in(counter, cpuctx, ctx, cpu);
+ }
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_group_times(counter);
+ counter->state = PERF_COUNTER_STATE_ERROR;
+ }
+ }
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ /*
+ * Ignore counters in OFF or ERROR state, and
+ * ignore pinned counters since we did them already.
+ */
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ counter->attr.pinned)
+ continue;
+
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of counters:
+ */
+ if (counter->cpu != -1 && counter->cpu != cpu)
+ continue;
+
+ if (counter != counter->group_leader) {
+ if (counter_sched_in(counter, cpuctx, ctx, cpu))
+ can_add_hw = 0;
+ } else {
+ if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+ if (group_sched_in(counter, cpuctx, ctx, cpu))
+ can_add_hw = 0;
+ }
+ }
+ }
+ perf_enable();
+ out:
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+
+ if (likely(!ctx))
+ return;
+ if (cpuctx->task_ctx == ctx)
+ return;
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+ cpuctx->task_ctx = ctx;
+}
+
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
+
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 period, sample_period;
+ s64 delta;
+
+ events *= hwc->sample_period;
+ period = div64_u64(events, counter->attr.sample_freq);
+
+ delta = (s64)(period - hwc->sample_period);
+ delta = (delta + 7) / 8; /* low pass filter */
+
+ sample_period = hwc->sample_period + delta;
+
+ if (!sample_period)
+ sample_period = 1;
+
+ hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ u64 interrupts, freq;
+
+ spin_lock(&ctx->lock);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ continue;
+
+ hwc = &counter->hw;
+
+ interrupts = hwc->interrupts;
+ hwc->interrupts = 0;
+
+ /*
+ * unthrottle counters on the tick
+ */
+ if (interrupts == MAX_INTERRUPTS) {
+ perf_log_throttle(counter, 1);
+ counter->pmu->unthrottle(counter);
+ interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
+ }
+
+ if (!counter->attr.freq || !counter->attr.sample_freq)
+ continue;
+
+ /*
+ * if the specified freq < HZ then we need to skip ticks
+ */
+ if (counter->attr.sample_freq < HZ) {
+ freq = counter->attr.sample_freq;
+
+ hwc->freq_count += freq;
+ hwc->freq_interrupts += interrupts;
+
+ if (hwc->freq_count < HZ)
+ continue;
+
+ interrupts = hwc->freq_interrupts;
+ hwc->freq_interrupts = 0;
+ hwc->freq_count -= HZ;
+ } else
+ freq = HZ;
+
+ perf_adjust_period(counter, freq * interrupts);
+
+ /*
+ * In order to avoid being stalled by an (accidental) huge
+ * sample period, force reset the sample period if we didn't
+ * get any events in this freq period.
+ */
+ if (!interrupts) {
+ perf_disable();
+ counter->pmu->disable(counter);
+ atomic64_set(&hwc->period_left, 0);
+ counter->pmu->enable(counter);
+ perf_enable();
+ }
+ }
+ spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ if (!ctx->nr_counters)
+ return;
+
+ spin_lock(&ctx->lock);
+ /*
+ * Rotate the first entry last (works just fine for group counters too):
+ */
+ perf_disable();
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ list_move_tail(&counter->list_entry, &ctx->counter_list);
+ break;
+ }
+ perf_enable();
+
+ spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+
+ if (!atomic_read(&nr_counters))
+ return;
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ ctx = curr->perf_counter_ctxp;
+
+ perf_ctx_adjust_freq(&cpuctx->ctx);
+ if (ctx)
+ perf_ctx_adjust_freq(ctx);
+
+ perf_counter_cpu_sched_out(cpuctx);
+ if (ctx)
+ __perf_counter_task_sched_out(ctx);
+
+ rotate_ctx(&cpuctx->ctx);
+ if (ctx)
+ rotate_ctx(ctx);
+
+ perf_counter_cpu_sched_in(cpuctx, cpu);
+ if (ctx)
+ perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Enable all of a task's counters that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_counter_enable_on_exec(struct task_struct *task)
+{
+ struct perf_counter_context *ctx;
+ struct perf_counter *counter;
+ unsigned long flags;
+ int enabled = 0;
+
+ local_irq_save(flags);
+ ctx = task->perf_counter_ctxp;
+ if (!ctx || !ctx->nr_counters)
+ goto out;
+
+ __perf_counter_task_sched_out(ctx);
+
+ spin_lock(&ctx->lock);
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (!counter->attr.enable_on_exec)
+ continue;
+ counter->attr.enable_on_exec = 0;
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ continue;
+ __perf_counter_mark_enabled(counter, ctx);
+ enabled = 1;
+ }
+
+ /*
+ * Unclone this context if we enabled any counter.
+ */
+ if (enabled)
+ unclone_ctx(ctx);
+
+ spin_unlock(&ctx->lock);
+
+ perf_counter_task_sched_in(task, smp_processor_id());
+ out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __perf_counter_read(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ local_irq_save(flags);
+ if (ctx->is_active)
+ update_context_time(ctx);
+ counter->pmu->read(counter);
+ update_counter_times(counter);
+ local_irq_restore(flags);
+}
+
+static u64 perf_counter_read(struct perf_counter *counter)
+{
+ /*
+ * If counter is enabled and currently active on a CPU, update the
+ * value in the counter structure:
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ smp_call_function_single(counter->oncpu,
+ __perf_counter_read, counter, 1);
+ } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_counter_times(counter);
+ }
+
+ return atomic64_read(&counter->count);
+}
+
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ spin_lock_init(&ctx->lock);
+ mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ INIT_LIST_HEAD(&ctx->event_list);
+ atomic_set(&ctx->refcount, 1);
+ ctx->task = task;
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+ struct perf_counter_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ struct task_struct *task;
+ unsigned long flags;
+ int err;
+
+ /*
+ * If cpu is not a wildcard then this is a percpu counter:
+ */
+ if (cpu != -1) {
+ /* Must be root to operate on a CPU counter: */
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EACCES);
+
+ if (cpu < 0 || cpu > num_possible_cpus())
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * We could be clever and allow to attach a counter to an
+ * offline CPU and activate it when the CPU comes up, but
+ * that's for later.
+ */
+ if (!cpu_isset(cpu, cpu_online_map))
+ return ERR_PTR(-ENODEV);
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+ get_ctx(ctx);
+
+ return ctx;
+ }
+
+ rcu_read_lock();
+ if (!pid)
+ task = current;
+ else
+ task = find_task_by_vpid(pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ /*
+ * Can't attach counters to a dying task.
+ */
+ err = -ESRCH;
+ if (task->flags & PF_EXITING)
+ goto errout;
+
+ /* Reuse ptrace permission checks for now. */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto errout;
+
+ retry:
+ ctx = perf_lock_task_context(task, &flags);
+ if (ctx) {
+ unclone_ctx(ctx);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ if (!ctx) {
+ ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!ctx)
+ goto errout;
+ __perf_counter_init_context(ctx, task);
+ get_ctx(ctx);
+ if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
+ /*
+ * We raced with some other task; use
+ * the context they set.
+ */
+ kfree(ctx);
+ goto retry;
+ }
+ get_task_struct(task);
+ }
+
+ put_task_struct(task);
+ return ctx;
+
+ errout:
+ put_task_struct(task);
+ return ERR_PTR(err);
+}
+
+static void free_counter_rcu(struct rcu_head *head)
+{
+ struct perf_counter *counter;
+
+ counter = container_of(head, struct perf_counter, rcu_head);
+ if (counter->ns)
+ put_pid_ns(counter->ns);
+ kfree(counter);
+}
+
+static void perf_pending_sync(struct perf_counter *counter);
+
+static void free_counter(struct perf_counter *counter)
+{
+ perf_pending_sync(counter);
+
+ if (!counter->parent) {
+ atomic_dec(&nr_counters);
+ if (counter->attr.mmap)
+ atomic_dec(&nr_mmap_counters);
+ if (counter->attr.comm)
+ atomic_dec(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_dec(&nr_task_counters);
+ }
+
+ if (counter->output) {
+ fput(counter->output->filp);
+ counter->output = NULL;
+ }
+
+ if (counter->destroy)
+ counter->destroy(counter);
+
+ put_ctx(counter->ctx);
+ call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+ struct perf_counter *counter = file->private_data;
+ struct perf_counter_context *ctx = counter->ctx;
+
+ file->private_data = NULL;
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_counter_remove_from_context(counter);
+ mutex_unlock(&ctx->mutex);
+
+ mutex_lock(&counter->owner->perf_counter_mutex);
+ list_del_init(&counter->owner_entry);
+ mutex_unlock(&counter->owner->perf_counter_mutex);
+ put_task_struct(counter->owner);
+
+ free_counter(counter);
+
+ return 0;
+}
+
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += counter->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+
+ return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
+{
+ struct perf_counter *child;
+ u64 total = 0;
+
+ total += perf_counter_read(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ total += perf_counter_read(child);
+
+ return total;
+}
+
+static int perf_counter_read_entry(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ int n = 0, count = 0;
+ u64 values[2];
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ count = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, count))
+ return -EFAULT;
+
+ return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ int n = 0, size = 0, err = -EFAULT;
+ u64 values[3];
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ return -EFAULT;
+
+ err = perf_counter_read_entry(leader, read_format, buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ err = perf_counter_read_entry(sub, read_format,
+ buf + size);
+ if (err < 0)
+ return err;
+
+ size += err;
+ }
+
+ return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+ u64 read_format, char __user *buf)
+{
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_counter_read_value(counter);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+}
+
+/*
+ * Read the performance counter - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+ u64 read_format = counter->attr.read_format;
+ int ret;
+
+ /*
+ * Return end-of-file for a read on a counter that is in
+ * error state (i.e. because it was pinned but it couldn't be
+ * scheduled on to the CPU at some point).
+ */
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ return 0;
+
+ if (count < perf_counter_read_size(counter))
+ return -ENOSPC;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->child_mutex);
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_counter_read_group(counter, read_format, buf);
+ else
+ ret = perf_counter_read_one(counter, read_format, buf);
+ mutex_unlock(&counter->child_mutex);
+
+ return ret;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct perf_counter *counter = file->private_data;
+
+ return perf_read_hw(counter, buf, count);
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+ struct perf_counter *counter = file->private_data;
+ struct perf_mmap_data *data;
+ unsigned int events = POLL_HUP;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (data)
+ events = atomic_xchg(&data->poll, 0);
+ rcu_read_unlock();
+
+ poll_wait(file, &counter->waitq, wait);
+
+ return events;
+}
+
+static void perf_counter_reset(struct perf_counter *counter)
+{
+ (void)perf_counter_read(counter);
+ atomic64_set(&counter->count, 0);
+ perf_counter_update_userpage(counter);
+}
+
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
+static void perf_counter_for_each_child(struct perf_counter *counter,
+ void (*func)(struct perf_counter *))
+{
+ struct perf_counter *child;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->child_mutex);
+ func(counter);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ func(child);
+ mutex_unlock(&counter->child_mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+ void (*func)(struct perf_counter *))
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *sibling;
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ counter = counter->group_leader;
+
+ perf_counter_for_each_child(counter, func);
+ func(counter);
+ list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+ perf_counter_for_each_child(counter, func);
+ mutex_unlock(&ctx->mutex);
+}
+
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ unsigned long size;
+ int ret = 0;
+ u64 value;
+
+ if (!counter->attr.sample_period)
+ return -EINVAL;
+
+ size = copy_from_user(&value, arg, sizeof(value));
+ if (size != sizeof(value))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ spin_lock_irq(&ctx->lock);
+ if (counter->attr.freq) {
+ if (value > sysctl_perf_counter_sample_rate) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ counter->attr.sample_freq = value;
+ } else {
+ counter->attr.sample_period = value;
+ counter->hw.sample_period = value;
+ }
+unlock:
+ spin_unlock_irq(&ctx->lock);
+
+ return ret;
+}
+
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_counter *counter = file->private_data;
+ void (*func)(struct perf_counter *);
+ u32 flags = arg;
+
+ switch (cmd) {
+ case PERF_COUNTER_IOC_ENABLE:
+ func = perf_counter_enable;
+ break;
+ case PERF_COUNTER_IOC_DISABLE:
+ func = perf_counter_disable;
+ break;
+ case PERF_COUNTER_IOC_RESET:
+ func = perf_counter_reset;
+ break;
+
+ case PERF_COUNTER_IOC_REFRESH:
+ return perf_counter_refresh(counter, arg);
+
+ case PERF_COUNTER_IOC_PERIOD:
+ return perf_counter_period(counter, (u64 __user *)arg);
+
+ case PERF_COUNTER_IOC_SET_OUTPUT:
+ return perf_counter_set_output(counter, arg);
+
+ default:
+ return -ENOTTY;
+ }
+
+ if (flags & PERF_IOC_FLAG_GROUP)
+ perf_counter_for_each(counter, func);
+ else
+ perf_counter_for_each_child(counter, func);
+
+ return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_enable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+ struct perf_counter *counter;
+
+ mutex_lock(&current->perf_counter_mutex);
+ list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+ perf_counter_for_each_child(counter, perf_counter_disable);
+ mutex_unlock(&current->perf_counter_mutex);
+
+ return 0;
+}
+
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
+static int perf_counter_index(struct perf_counter *counter)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return 0;
+
+ return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+ struct perf_counter_mmap_page *userpg;
+ struct perf_mmap_data *data;
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ userpg = data->user_page;
+
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
+ ++userpg->lock;
+ barrier();
+ userpg->index = perf_counter_index(counter);
+ userpg->offset = atomic64_read(&counter->count);
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+ userpg->time_enabled = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+
+ userpg->time_running = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+
+ barrier();
+ ++userpg->lock;
+ preempt_enable();
+unlock:
+ rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+ struct perf_mmap_data *data;
+ int ret = VM_FAULT_SIGBUS;
+
+ if (vmf->flags & FAULT_FLAG_MKWRITE) {
+ if (vmf->pgoff == 0)
+ ret = 0;
+ return ret;
+ }
+
+ rcu_read_lock();
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto unlock;
+
+ if (vmf->pgoff == 0) {
+ vmf->page = virt_to_page(data->user_page);
+ } else {
+ int nr = vmf->pgoff - 1;
+
+ if ((unsigned)nr > data->nr_pages)
+ goto unlock;
+
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ goto unlock;
+
+ vmf->page = virt_to_page(data->data_pages[nr]);
+ }
+
+ get_page(vmf->page);
+ vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+ struct perf_mmap_data *data;
+ unsigned long size;
+ int i;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ size = sizeof(struct perf_mmap_data);
+ size += nr_pages * sizeof(void *);
+
+ data = kzalloc(size, GFP_KERNEL);
+ if (!data)
+ goto fail;
+
+ data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!data->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ data->nr_pages = nr_pages;
+ atomic_set(&data->lock, -1);
+
+ rcu_assign_pointer(counter->data, data);
+
+ return 0;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)data->data_pages[i]);
+
+ free_page((unsigned long)data->user_page);
+
+fail_user_page:
+ kfree(data);
+
+fail:
+ return -ENOMEM;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page((void *)addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+ struct perf_mmap_data *data;
+ int i;
+
+ data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
+ perf_mmap_free_page((unsigned long)data->user_page);
+ for (i = 0; i < data->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)data->data_pages[i]);
+
+ kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+ struct perf_mmap_data *data = counter->data;
+
+ WARN_ON(atomic_read(&counter->mmap_count));
+
+ rcu_assign_pointer(counter->data, NULL);
+ call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = vma->vm_file->private_data;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
+ struct user_struct *user = current_user();
+
+ atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
+ vma->vm_mm->locked_vm -= counter->data->nr_locked;
+ perf_mmap_data_free(counter);
+ mutex_unlock(&counter->mmap_mutex);
+ }
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close,
+ .fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_counter *counter = file->private_data;
+ unsigned long user_locked, user_lock_limit;
+ struct user_struct *user = current_user();
+ unsigned long locked, lock_limit;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ long user_extra, extra;
+ int ret = 0;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+ /*
+ * If we have data pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
+
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
+ return -EINVAL;
+
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ WARN_ON_ONCE(counter->ctx->parent_ctx);
+ mutex_lock(&counter->mmap_mutex);
+ if (counter->output) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (atomic_inc_not_zero(&counter->mmap_count)) {
+ if (nr_pages != counter->data->nr_pages)
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ user_extra = nr_pages + 1;
+ user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+ extra = 0;
+ if (user_locked > user_lock_limit)
+ extra = user_locked - user_lock_limit;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit >>= PAGE_SHIFT;
+ locked = vma->vm_mm->locked_vm + extra;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(counter->data);
+ ret = perf_mmap_data_alloc(counter, nr_pages);
+ if (ret)
+ goto unlock;
+
+ atomic_set(&counter->mmap_count, 1);
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->locked_vm += extra;
+ counter->data->nr_locked = extra;
+ if (vma->vm_flags & VM_WRITE)
+ counter->data->writable = 1;
+
+unlock:
+ mutex_unlock(&counter->mmap_mutex);
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &perf_mmap_vmops;
+
+ return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct perf_counter *counter = filp->private_data;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &counter->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
+static const struct file_operations perf_fops = {
+ .release = perf_release,
+ .read = perf_read,
+ .poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_ioctl,
+ .mmap = perf_mmap,
+ .fasync = perf_fasync,
+};
+
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+ wake_up_all(&counter->waitq);
+
+ if (counter->pending_kill) {
+ kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+ counter->pending_kill = 0;
+ }
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ if (counter->pending_disable) {
+ counter->pending_disable = 0;
+ __perf_counter_disable(counter);
+ }
+
+ if (counter->pending_wakeup) {
+ counter->pending_wakeup = 0;
+ perf_counter_wakeup(counter);
+ }
+}
+
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
+ PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_pending_entry *entry,
+ void (*func)(struct perf_pending_entry *))
+{
+ struct perf_pending_entry **head;
+
+ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
+ return;
+
+ entry->func = func;
+
+ head = &get_cpu_var(perf_pending_head);
+
+ do {
+ entry->next = *head;
+ } while (cmpxchg(head, entry->next, entry) != entry->next);
+
+ set_perf_counter_pending();
+
+ put_cpu_var(perf_pending_head);
+}
+
+static int __perf_pending_run(void)
+{
+ struct perf_pending_entry *list;
+ int nr = 0;
+
+ list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
+ while (list != PENDING_TAIL) {
+ void (*func)(struct perf_pending_entry *);
+ struct perf_pending_entry *entry = list;
+
+ list = list->next;
+
+ func = entry->func;
+ entry->next = NULL;
+ /*
+ * Ensure we observe the unqueue before we issue the wakeup,
+ * so that we won't be waiting forever.
+ * -- see perf_not_pending().
+ */
+ smp_wmb();
+
+ func(entry);
+ nr++;
+ }
+
+ return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+ /*
+ * If we flush on whatever cpu we run, there is a chance we don't
+ * need to wait.
+ */
+ get_cpu();
+ __perf_pending_run();
+ put_cpu();
+
+ /*
+ * Ensure we see the proper queue state before going to sleep
+ * so that we do not miss the wakeup. -- see perf_pending_handle()
+ */
+ smp_rmb();
+ return counter->pending.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+ wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+ __perf_pending_run();
+}
+
+/*
+ * Callchain support -- arch specific
+ */
+
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ return NULL;
+}
+
+/*
+ * Output
+ */
+
+struct perf_output_handle {
+ struct perf_counter *counter;
+ struct perf_mmap_data *data;
+ unsigned long head;
+ unsigned long offset;
+ int nmi;
+ int sample;
+ int locked;
+ unsigned long flags;
+};
+
+static bool perf_output_space(struct perf_mmap_data *data,
+ unsigned int offset, unsigned int head)
+{
+ unsigned long tail;
+ unsigned long mask;
+
+ if (!data->writable)
+ return true;
+
+ mask = (data->nr_pages << PAGE_SHIFT) - 1;
+ /*
+ * Userspace could choose to issue a mb() before updating the tail
+ * pointer. So that all reads will be completed before the write is
+ * issued.
+ */
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+ smp_rmb();
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
+ if ((int)(head - offset) < 0)
+ return false;
+
+ return true;
+}
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+ atomic_set(&handle->data->poll, POLL_IN);
+
+ if (handle->nmi) {
+ handle->counter->pending_wakeup = 1;
+ perf_pending_queue(&handle->counter->pending,
+ perf_pending_counter);
+ } else
+ perf_counter_wakeup(handle->counter);
+}
+
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ int cpu;
+
+ handle->locked = 0;
+
+ local_irq_save(handle->flags);
+ cpu = smp_processor_id();
+
+ if (in_nmi() && atomic_read(&data->lock) == cpu)
+ return;
+
+ while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+ cpu_relax();
+
+ handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+ struct perf_mmap_data *data = handle->data;
+ unsigned long head;
+ int cpu;
+
+ data->done_head = data->head;
+
+ if (!handle->locked)
+ goto out;
+
+again:
+ /*
+ * The xchg implies a full barrier that ensures all writes are done
+ * before we publish the new head, matched by a rmb() in userspace when
+ * reading this position.
+ */
+ while ((head = atomic_long_xchg(&data->done_head, 0)))
+ data->user_page->data_head = head;
+
+ /*
+ * NMI can happen here, which means we can miss a done_head update.
+ */
+
+ cpu = atomic_xchg(&data->lock, -1);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ /*
+ * Therefore we have to validate we did not indeed do so.
+ */
+ if (unlikely(atomic_long_read(&data->done_head))) {
+ /*
+ * Since we had it locked, we can lock it again.
+ */
+ while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
+ cpu_relax();
+
+ goto again;
+ }
+
+ if (atomic_xchg(&data->wakeup, 0))
+ perf_output_wakeup(handle);
+out:
+ local_irq_restore(handle->flags);
+}
+
+static void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ unsigned int pages_mask;
+ unsigned int offset;
+ unsigned int size;
+ void **pages;
+
+ offset = handle->offset;
+ pages_mask = handle->data->nr_pages - 1;
+ pages = handle->data->data_pages;
+
+ do {
+ unsigned int page_offset;
+ int nr;
+
+ nr = (offset >> PAGE_SHIFT) & pages_mask;
+ page_offset = offset & (PAGE_SIZE - 1);
+ size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+ memcpy(pages[nr] + page_offset, buf, size);
+
+ len -= size;
+ buf += size;
+ offset += size;
+ } while (len);
+
+ handle->offset = offset;
+
+ /*
+ * Check we didn't copy past our reservation window, taking the
+ * possible unsigned int wrap into account.
+ */
+ WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
+}
+
+#define perf_output_put(handle, x) \
+ perf_output_copy((handle), &(x), sizeof(x))
+
+static int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_counter *counter, unsigned int size,
+ int nmi, int sample)
+{
+ struct perf_counter *output_counter;
+ struct perf_mmap_data *data;
+ unsigned int offset, head;
+ int have_lost;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ rcu_read_lock();
+ /*
+ * For inherited counters we send all the output towards the parent.
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ output_counter = rcu_dereference(counter->output);
+ if (output_counter)
+ counter = output_counter;
+
+ data = rcu_dereference(counter->data);
+ if (!data)
+ goto out;
+
+ handle->data = data;
+ handle->counter = counter;
+ handle->nmi = nmi;
+ handle->sample = sample;
+
+ if (!data->nr_pages)
+ goto fail;
+
+ have_lost = atomic_read(&data->lost);
+ if (have_lost)
+ size += sizeof(lost_event);
+
+ perf_output_lock(handle);
+
+ do {
+ offset = head = atomic_long_read(&data->head);
+ head += size;
+ if (unlikely(!perf_output_space(data, offset, head)))
+ goto fail;
+ } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+
+ handle->offset = offset;
+ handle->head = head;
+
+ if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+ atomic_set(&data->wakeup, 1);
+
+ if (have_lost) {
+ lost_event.header.type = PERF_EVENT_LOST;
+ lost_event.header.misc = 0;
+ lost_event.header.size = sizeof(lost_event);
+ lost_event.id = counter->id;
+ lost_event.lost = atomic_xchg(&data->lost, 0);
+
+ perf_output_put(handle, lost_event);
+ }
+
+ return 0;
+
+fail:
+ atomic_inc(&data->lost);
+ perf_output_unlock(handle);
+out:
+ rcu_read_unlock();
+
+ return -ENOSPC;
+}
+
+static void perf_output_end(struct perf_output_handle *handle)
+{
+ struct perf_counter *counter = handle->counter;
+ struct perf_mmap_data *data = handle->data;
+
+ int wakeup_events = counter->attr.wakeup_events;
+
+ if (handle->sample && wakeup_events) {
+ int events = atomic_inc_return(&data->events);
+ if (events >= wakeup_events) {
+ atomic_sub(wakeup_events, &data->events);
+ atomic_set(&data->wakeup, 1);
+ }
+ }
+
+ perf_output_unlock(handle);
+ rcu_read_unlock();
+}
+
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+ /*
+ * only top level counters have the pid namespace they were created in
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+ /*
+ * only top level counters have the pid namespace they were created in
+ */
+ if (counter->parent)
+ counter = counter->parent;
+
+ return task_pid_nr_ns(p, counter->ns);
+}
+
+static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ u64 read_format = counter->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = atomic64_read(&counter->count);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(counter);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ struct perf_counter *leader = counter->group_leader, *sub;
+ u64 read_format = counter->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = leader->total_time_enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = leader->total_time_running;
+
+ if (leader != counter)
+ leader->pmu->read(leader);
+
+ values[n++] = atomic64_read(&leader->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(leader);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+ n = 0;
+
+ if (sub != counter)
+ sub->pmu->read(sub);
+
+ values[n++] = atomic64_read(&sub->count);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_counter_id(sub);
+
+ perf_output_copy(handle, values, n * sizeof(u64));
+ }
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_counter *counter)
+{
+ if (counter->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, counter);
+ else
+ perf_output_read_one(handle, counter);
+}
+
+void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
+{
+ int ret;
+ u64 sample_type = counter->attr.sample_type;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ u64 ip;
+ struct {
+ u32 pid, tid;
+ } tid_entry;
+ struct perf_callchain_entry *callchain = NULL;
+ int callchain_size = 0;
+ u64 time;
+ struct {
+ u32 cpu, reserved;
+ } cpu_entry;
+
+ header.type = PERF_EVENT_SAMPLE;
+ header.size = sizeof(header);
+
+ header.misc = 0;
+ header.misc |= perf_misc_flags(data->regs);
+
+ if (sample_type & PERF_SAMPLE_IP) {
+ ip = perf_instruction_pointer(data->regs);
+ header.size += sizeof(ip);
+ }
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ tid_entry.pid = perf_counter_pid(counter, current);
+ tid_entry.tid = perf_counter_tid(counter, current);
+
+ header.size += sizeof(tid_entry);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME) {
+ /*
+ * Maybe do better on x86 and provide cpu_clock_nmi()
+ */
+ time = sched_clock();
+
+ header.size += sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ header.size += sizeof(cpu_entry);
+
+ cpu_entry.cpu = raw_smp_processor_id();
+ cpu_entry.reserved = 0;
+ }
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ header.size += sizeof(u64);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ header.size += perf_counter_read_size(counter);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ callchain = perf_callchain(data->regs);
+
+ if (callchain) {
+ callchain_size = (1 + callchain->nr) * sizeof(u64);
+ header.size += callchain_size;
+ } else
+ header.size += sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header.size += size;
+ }
+
+ ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, header);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(&handle, ip);
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(&handle, tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(&handle, time);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ perf_output_put(&handle, data->addr);
+
+ if (sample_type & PERF_SAMPLE_ID) {
+ u64 id = primary_counter_id(counter);
+
+ perf_output_put(&handle, id);
+ }
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(&handle, counter->id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(&handle, cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ perf_output_put(&handle, data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(&handle, counter);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ if (callchain)
+ perf_output_copy(&handle, callchain, callchain_size);
+ else {
+ u64 nr = 0;
+ perf_output_put(&handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(&handle, data->raw->size);
+ perf_output_copy(&handle, data->raw->data, data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(&handle, raw);
+ }
+ }
+
+ perf_output_end(&handle);
+}
+
+/*
+ * read event
+ */
+
+struct perf_read_event {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+};
+
+static void
+perf_counter_read_event(struct perf_counter *counter,
+ struct task_struct *task)
+{
+ struct perf_output_handle handle;
+ struct perf_read_event event = {
+ .header = {
+ .type = PERF_EVENT_READ,
+ .misc = 0,
+ .size = sizeof(event) + perf_counter_read_size(counter),
+ },
+ .pid = perf_counter_pid(counter, task),
+ .tid = perf_counter_tid(counter, task),
+ };
+ int ret;
+
+ ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, event);
+ perf_output_read(&handle, counter);
+
+ perf_output_end(&handle);
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
+ */
+
+struct perf_task_event {
+ struct task_struct *task;
+ struct perf_counter_context *task_ctx;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 ppid;
+ u32 tid;
+ u32 ptid;
+ } event;
+};
+
+static void perf_counter_task_output(struct perf_counter *counter,
+ struct perf_task_event *task_event)
+{
+ struct perf_output_handle handle;
+ int size = task_event->event.header.size;
+ struct task_struct *task = task_event->task;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ task_event->event.pid = perf_counter_pid(counter, task);
+ task_event->event.ppid = perf_counter_pid(counter, current);
+
+ task_event->event.tid = perf_counter_tid(counter, task);
+ task_event->event.ptid = perf_counter_tid(counter, current);
+
+ perf_output_put(&handle, task_event->event);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_task_match(struct perf_counter *counter)
+{
+ if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+ struct perf_task_event *task_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_task_match(counter))
+ perf_counter_task_output(counter, task_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_task_event(struct perf_task_event *task_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx = task_event->task_ctx;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_task_ctx(&cpuctx->ctx, task_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ if (!ctx)
+ ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_task_ctx(ctx, task_event);
+ rcu_read_unlock();
+}
+
+static void perf_counter_task(struct task_struct *task,
+ struct perf_counter_context *task_ctx,
+ int new)
+{
+ struct perf_task_event task_event;
+
+ if (!atomic_read(&nr_comm_counters) &&
+ !atomic_read(&nr_mmap_counters) &&
+ !atomic_read(&nr_task_counters))
+ return;
+
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event = {
+ .header = {
+ .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
+ .misc = 0,
+ .size = sizeof(task_event.event),
+ },
+ /* .pid */
+ /* .ppid */
+ /* .tid */
+ /* .ptid */
+ },
+ };
+
+ perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+ perf_counter_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+ struct task_struct *task;
+ char *comm;
+ int comm_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ } event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+ struct perf_comm_event *comm_event)
+{
+ struct perf_output_handle handle;
+ int size = comm_event->event.header.size;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+ comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
+ perf_output_put(&handle, comm_event->event);
+ perf_output_copy(&handle, comm_event->comm,
+ comm_event->comm_size);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter)
+{
+ if (counter->attr.comm)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+ struct perf_comm_event *comm_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_comm_match(counter))
+ perf_counter_comm_output(counter, comm_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+ unsigned int size;
+ char comm[TASK_COMM_LEN];
+
+ memset(comm, 0, sizeof(comm));
+ strncpy(comm, comm_event->task->comm, sizeof(comm));
+ size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+ comm_event->comm = comm;
+ comm_event->comm_size = size;
+
+ comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_comm_ctx(ctx, comm_event);
+ rcu_read_unlock();
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+ struct perf_comm_event comm_event;
+
+ if (task->perf_counter_ctxp)
+ perf_counter_enable_on_exec(task);
+
+ if (!atomic_read(&nr_comm_counters))
+ return;
+
+ comm_event = (struct perf_comm_event){
+ .task = task,
+ /* .comm */
+ /* .comm_size */
+ .event = {
+ .header = {
+ .type = PERF_EVENT_COMM,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ },
+ };
+
+ perf_counter_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+ struct vm_area_struct *vma;
+
+ const char *file_name;
+ int file_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 start;
+ u64 len;
+ u64 pgoff;
+ } event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+ struct perf_mmap_event *mmap_event)
+{
+ struct perf_output_handle handle;
+ int size = mmap_event->event.header.size;
+ int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+ if (ret)
+ return;
+
+ mmap_event->event.pid = perf_counter_pid(counter, current);
+ mmap_event->event.tid = perf_counter_tid(counter, current);
+
+ perf_output_put(&handle, mmap_event->event);
+ perf_output_copy(&handle, mmap_event->file_name,
+ mmap_event->file_size);
+ perf_output_end(&handle);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+ struct perf_mmap_event *mmap_event)
+{
+ if (counter->attr.mmap)
+ return 1;
+
+ return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+ struct perf_mmap_event *mmap_event)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_counter_mmap_match(counter, mmap_event))
+ perf_counter_mmap_output(counter, mmap_event);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_counter_context *ctx;
+ struct vm_area_struct *vma = mmap_event->vma;
+ struct file *file = vma->vm_file;
+ unsigned int size;
+ char tmp[16];
+ char *buf = NULL;
+ const char *name;
+
+ memset(tmp, 0, sizeof(tmp));
+
+ if (file) {
+ /*
+ * d_path works from the end of the buffer backwards, so we
+ * need to add enough zero bytes after the string to handle
+ * the 64bit alignment we do later.
+ */
+ buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
+ if (!buf) {
+ name = strncpy(tmp, "//enomem", sizeof(tmp));
+ goto got_name;
+ }
+ name = d_path(&file->f_path, buf, PATH_MAX);
+ if (IS_ERR(name)) {
+ name = strncpy(tmp, "//toolong", sizeof(tmp));
+ goto got_name;
+ }
+ } else {
+ if (arch_vma_name(mmap_event->vma)) {
+ name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+ sizeof(tmp));
+ goto got_name;
+ }
+
+ if (!vma->vm_mm) {
+ name = strncpy(tmp, "[vdso]", sizeof(tmp));
+ goto got_name;
+ }
+
+ name = strncpy(tmp, "//anon", sizeof(tmp));
+ goto got_name;
+ }
+
+got_name:
+ size = ALIGN(strlen(name)+1, sizeof(u64));
+
+ mmap_event->file_name = name;
+ mmap_event->file_size = size;
+
+ mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+ cpuctx = &get_cpu_var(perf_cpu_context);
+ perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+ put_cpu_var(perf_cpu_context);
+
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_counter_mmap_ctx(ctx, mmap_event);
+ rcu_read_unlock();
+
+ kfree(buf);
+}
+
+void __perf_counter_mmap(struct vm_area_struct *vma)
+{
+ struct perf_mmap_event mmap_event;
+
+ if (!atomic_read(&nr_mmap_counters))
+ return;
+
+ mmap_event = (struct perf_mmap_event){
+ .vma = vma,
+ /* .file_name */
+ /* .file_size */
+ .event = {
+ .header = {
+ .type = PERF_EVENT_MMAP,
+ .misc = 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ .start = vma->vm_start,
+ .len = vma->vm_end - vma->vm_start,
+ .pgoff = vma->vm_pgoff,
+ },
+ };
+
+ perf_counter_mmap_event(&mmap_event);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+ struct perf_output_handle handle;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ } throttle_event = {
+ .header = {
+ .type = PERF_EVENT_THROTTLE,
+ .misc = 0,
+ .size = sizeof(throttle_event),
+ },
+ .time = sched_clock(),
+ .id = primary_counter_id(counter),
+ .stream_id = counter->id,
+ };
+
+ if (enable)
+ throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
+ ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, throttle_event);
+ perf_output_end(&handle);
+}
+
+/*
+ * Generic counter overflow handling, sampling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data)
+{
+ int events = atomic_read(&counter->event_limit);
+ int throttle = counter->pmu->unthrottle != NULL;
+ struct hw_perf_counter *hwc = &counter->hw;
+ int ret = 0;
+
+ if (!throttle) {
+ hwc->interrupts++;
+ } else {
+ if (hwc->interrupts != MAX_INTERRUPTS) {
+ hwc->interrupts++;
+ if (HZ * hwc->interrupts >
+ (u64)sysctl_perf_counter_sample_rate) {
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(counter, 0);
+ ret = 1;
+ }
+ } else {
+ /*
+ * Keep re-disabling counters even though on the previous
+ * pass we disabled it - just in case we raced with a
+ * sched-in and the counter got enabled again:
+ */
+ ret = 1;
+ }
+ }
+
+ if (counter->attr.freq) {
+ u64 now = sched_clock();
+ s64 delta = now - hwc->freq_stamp;
+
+ hwc->freq_stamp = now;
+
+ if (delta > 0 && delta < TICK_NSEC)
+ perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+ }
+
+ /*
+ * XXX event_limit might not quite work as expected on inherited
+ * counters
+ */
+
+ counter->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&counter->event_limit)) {
+ ret = 1;
+ counter->pending_kill = POLL_HUP;
+ if (nmi) {
+ counter->pending_disable = 1;
+ perf_pending_queue(&counter->pending,
+ perf_pending_counter);
+ } else
+ perf_counter_disable(counter);
+ }
+
+ perf_counter_output(counter, nmi, data);
+ return ret;
+}
+
+/*
+ * Generic software counter infrastructure
+ */
+
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = atomic64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+ int nmi, struct perf_sample_data *data)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 overflow;
+
+ data->period = counter->hw.last_period;
+ overflow = perf_swcounter_set_period(counter);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (perf_counter_overflow(counter, nmi, data)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ }
+}
+
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
+{
+ /*
+ * Nothing to do, we already reset hwc->interrupts.
+ */
+}
+
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+ int nmi, struct perf_sample_data *data)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ atomic64_add(nr, &counter->count);
+
+ if (!hwc->sample_period)
+ return;
+
+ if (!data->regs)
+ return;
+
+ if (!atomic64_add_negative(nr, &hwc->period_left))
+ perf_swcounter_overflow(counter, nmi, data);
+}
+
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+ /*
+ * The counter is active, we're good!
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ return 1;
+
+ /*
+ * The counter is off/error, not counting.
+ */
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+ return 0;
+
+ /*
+ * The counter is inactive, if the context is active
+ * we're part of a group that didn't make it on the 'pmu',
+ * not counting.
+ */
+ if (counter->ctx->is_active)
+ return 0;
+
+ /*
+ * We're inactive and the context is too, this means the
+ * task is scheduled out, we're counting events that happen
+ * to us, like migration events.
+ */
+ return 1;
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+ enum perf_type_id type,
+ u32 event, struct pt_regs *regs)
+{
+ if (!perf_swcounter_is_counting(counter))
+ return 0;
+
+ if (counter->attr.type != type)
+ return 0;
+ if (counter->attr.config != event)
+ return 0;
+
+ if (regs) {
+ if (counter->attr.exclude_user && user_mode(regs))
+ return 0;
+
+ if (counter->attr.exclude_kernel && !user_mode(regs))
+ return 0;
+ }
+
+ return 1;
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+ enum perf_type_id type,
+ u32 event, u64 nr, int nmi,
+ struct perf_sample_data *data)
+{
+ struct perf_counter *counter;
+
+ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+ if (perf_swcounter_match(counter, type, event, data->regs))
+ perf_swcounter_add(counter, nr, nmi, data);
+ }
+ rcu_read_unlock();
+}
+
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+ if (in_nmi())
+ return &cpuctx->recursion[3];
+
+ if (in_irq())
+ return &cpuctx->recursion[2];
+
+ if (in_softirq())
+ return &cpuctx->recursion[1];
+
+ return &cpuctx->recursion[0];
+}
+
+static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
+ u64 nr, int nmi,
+ struct perf_sample_data *data)
+{
+ struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ int *recursion = perf_swcounter_recursion_context(cpuctx);
+ struct perf_counter_context *ctx;
+
+ if (*recursion)
+ goto out;
+
+ (*recursion)++;
+ barrier();
+
+ perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+ nr, nmi, data);
+ rcu_read_lock();
+ /*
+ * doesn't really matter which of the child contexts the
+ * events ends up in.
+ */
+ ctx = rcu_dereference(current->perf_counter_ctxp);
+ if (ctx)
+ perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+ rcu_read_unlock();
+
+ barrier();
+ (*recursion)--;
+
+out:
+ put_cpu_var(perf_cpu_context);
+}
+
+void __perf_swcounter_event(u32 event, u64 nr, int nmi,
+ struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data = {
+ .regs = regs,
+ .addr = addr,
+ };
+
+ do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (hwc->sample_period) {
+ hwc->last_period = hwc->sample_period;
+ perf_swcounter_set_period(counter);
+ }
+ return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+}
+
+static const struct pmu perf_ops_generic = {
+ .enable = perf_swcounter_enable,
+ .disable = perf_swcounter_disable,
+ .read = perf_swcounter_read,
+ .unthrottle = perf_swcounter_unthrottle,
+};
+
+/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct perf_counter *counter;
+ u64 period;
+
+ counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
+ counter->pmu->read(counter);
+
+ data.addr = 0;
+ data.regs = get_irq_regs();
+ /*
+ * In case we exclude kernel IPs or are somehow not in interrupt
+ * context, provide the next best thing, the user IP.
+ */
+ if ((counter->attr.exclude_kernel || !data.regs) &&
+ !counter->attr.exclude_user)
+ data.regs = task_pt_regs(current);
+
+ if (data.regs) {
+ if (perf_counter_overflow(counter, 0, &data))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, counter->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+ int cpu = raw_smp_processor_id();
+ s64 prev;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ prev = atomic64_read(&counter->hw.prev_count);
+ atomic64_set(&counter->hw.prev_count, now);
+ atomic64_add(now - prev, &counter->count);
+}
+
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int cpu = raw_smp_processor_id();
+
+ atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swcounter_hrtimer;
+ if (hwc->sample_period) {
+ u64 period = max_t(u64, 10000, hwc->sample_period);
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
+
+ return 0;
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+ if (counter->hw.sample_period)
+ hrtimer_cancel(&counter->hw.hrtimer);
+ cpu_clock_perf_counter_update(counter);
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+ cpu_clock_perf_counter_update(counter);
+}
+
+static const struct pmu perf_ops_cpu_clock = {
+ .enable = cpu_clock_perf_counter_enable,
+ .disable = cpu_clock_perf_counter_disable,
+ .read = cpu_clock_perf_counter_read,
+};
+
+/*
+ * Software counter: task time clock
+ */
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = atomic64_xchg(&counter->hw.prev_count, now);
+ delta = now - prev;
+ atomic64_add(delta, &counter->count);
+}
+
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ u64 now;
+
+ now = counter->ctx->time;
+
+ atomic64_set(&hwc->prev_count, now);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swcounter_hrtimer;
+ if (hwc->sample_period) {
+ u64 period = max_t(u64, 10000, hwc->sample_period);
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
+
+ return 0;
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+ if (counter->hw.sample_period)
+ hrtimer_cancel(&counter->hw.hrtimer);
+ task_clock_perf_counter_update(counter, counter->ctx->time);
+
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+ u64 time;
+
+ if (!in_nmi()) {
+ update_context_time(counter->ctx);
+ time = counter->ctx->time;
+ } else {
+ u64 now = perf_clock();
+ u64 delta = now - counter->ctx->timestamp;
+ time = counter->ctx->time + delta;
+ }
+
+ task_clock_perf_counter_update(counter, time);
+}
+
+static const struct pmu perf_ops_task_clock = {
+ .enable = task_clock_perf_counter_enable,
+ .disable = task_clock_perf_counter_disable,
+ .read = task_clock_perf_counter_read,
+};
+
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+ int entry_size)
+{
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ struct perf_sample_data data = {
+ .regs = get_irq_regs(),
+ .addr = addr,
+ .raw = &raw,
+ };
+
+ if (!data.regs)
+ data.regs = task_pt_regs(current);
+
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+}
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+ ftrace_profile_disable(counter->attr.config);
+}
+
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+ /*
+ * Raw tracepoint data is a severe data leak, only allow root to
+ * have these.
+ */
+ if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+ perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (ftrace_profile_enable(counter->attr.config))
+ return NULL;
+
+ counter->destroy = tp_perf_counter_destroy;
+
+ return &perf_ops_generic;
+}
+#else
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
+{
+ return NULL;
+}
+#endif
+
+atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_counter_destroy(struct perf_counter *counter)
+{
+ u64 event = counter->attr.config;
+
+ WARN_ON(counter->parent);
+
+ atomic_dec(&perf_swcounter_enabled[event]);
+}
+
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
+{
+ const struct pmu *pmu = NULL;
+ u64 event = counter->attr.config;
+
+ /*
+ * Software counters (currently) can't in general distinguish
+ * between user, kernel and hypervisor events.
+ * However, context switches and cpu migrations are considered
+ * to be kernel events, and page faults are never hypervisor
+ * events.
+ */
+ switch (event) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ pmu = &perf_ops_cpu_clock;
+
+ break;
+ case PERF_COUNT_SW_TASK_CLOCK:
+ /*
+ * If the user instantiates this as a per-cpu counter,
+ * use the cpu_clock counter instead.
+ */
+ if (counter->ctx->task)
+ pmu = &perf_ops_task_clock;
+ else
+ pmu = &perf_ops_cpu_clock;
+
+ break;
+ case PERF_COUNT_SW_PAGE_FAULTS:
+ case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+ case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+ case PERF_COUNT_SW_CONTEXT_SWITCHES:
+ case PERF_COUNT_SW_CPU_MIGRATIONS:
+ if (!counter->parent) {
+ atomic_inc(&perf_swcounter_enabled[event]);
+ counter->destroy = sw_perf_counter_destroy;
+ }
+ pmu = &perf_ops_generic;
+ break;
+ }
+
+ return pmu;
+}
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(struct perf_counter_attr *attr,
+ int cpu,
+ struct perf_counter_context *ctx,
+ struct perf_counter *group_leader,
+ struct perf_counter *parent_counter,
+ gfp_t gfpflags)
+{
+ const struct pmu *pmu;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ long err;
+
+ counter = kzalloc(sizeof(*counter), gfpflags);
+ if (!counter)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Single counters are their own group leaders, with an
+ * empty sibling list:
+ */
+ if (!group_leader)
+ group_leader = counter;
+
+ mutex_init(&counter->child_mutex);
+ INIT_LIST_HEAD(&counter->child_list);
+
+ INIT_LIST_HEAD(&counter->list_entry);
+ INIT_LIST_HEAD(&counter->event_entry);
+ INIT_LIST_HEAD(&counter->sibling_list);
+ init_waitqueue_head(&counter->waitq);
+
+ mutex_init(&counter->mmap_mutex);
+
+ counter->cpu = cpu;
+ counter->attr = *attr;
+ counter->group_leader = group_leader;
+ counter->pmu = NULL;
+ counter->ctx = ctx;
+ counter->oncpu = -1;
+
+ counter->parent = parent_counter;
+
+ counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+ counter->id = atomic64_inc_return(&perf_counter_id);
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+
+ if (attr->disabled)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ pmu = NULL;
+
+ hwc = &counter->hw;
+ hwc->sample_period = attr->sample_period;
+ if (attr->freq && attr->sample_freq)
+ hwc->sample_period = 1;
+ hwc->last_period = hwc->sample_period;
+
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+
+ /*
+ * we currently do not support PERF_FORMAT_GROUP on inherited counters
+ */
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ goto done;
+
+ switch (attr->type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ pmu = hw_perf_counter_init(counter);
+ break;
+
+ case PERF_TYPE_SOFTWARE:
+ pmu = sw_perf_counter_init(counter);
+ break;
+
+ case PERF_TYPE_TRACEPOINT:
+ pmu = tp_perf_counter_init(counter);
+ break;
+
+ default:
+ break;
+ }
+done:
+ err = 0;
+ if (!pmu)
+ err = -EINVAL;
+ else if (IS_ERR(pmu))
+ err = PTR_ERR(pmu);
+
+ if (err) {
+ if (counter->ns)
+ put_pid_ns(counter->ns);
+ kfree(counter);
+ return ERR_PTR(err);
+ }
+
+ counter->pmu = pmu;
+
+ if (!counter->parent) {
+ atomic_inc(&nr_counters);
+ if (counter->attr.mmap)
+ atomic_inc(&nr_mmap_counters);
+ if (counter->attr.comm)
+ atomic_inc(&nr_comm_counters);
+ if (counter->attr.task)
+ atomic_inc(&nr_task_counters);
+ }
+
+ return counter;
+}
+
+static int perf_copy_attr(struct perf_counter_attr __user *uattr,
+ struct perf_counter_attr *attr)
+{
+ int ret;
+ u32 size;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned long val;
+ unsigned long __user *addr;
+ unsigned long __user *end;
+
+ addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+ sizeof(unsigned long));
+ end = PTR_ALIGN((void __user *)uattr + size,
+ sizeof(unsigned long));
+
+ for (; addr < end; addr += sizeof(unsigned long)) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * If the type exists, the corresponding creation will verify
+ * the attr->config.
+ */
+ if (attr->type >= PERF_TYPE_MAX)
+ return -EINVAL;
+
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+ struct perf_counter *output_counter = NULL;
+ struct file *output_file = NULL;
+ struct perf_counter *old_output;
+ int fput_needed = 0;
+ int ret = -EINVAL;
+
+ if (!output_fd)
+ goto set;
+
+ output_file = fget_light(output_fd, &fput_needed);
+ if (!output_file)
+ return -EBADF;
+
+ if (output_file->f_op != &perf_fops)
+ goto out;
+
+ output_counter = output_file->private_data;
+
+ /* Don't chain output fds */
+ if (output_counter->output)
+ goto out;
+
+ /* Don't set an output fd when we already have an output channel */
+ if (counter->data)
+ goto out;
+
+ atomic_long_inc(&output_file->f_count);
+
+set:
+ mutex_lock(&counter->mmap_mutex);
+ old_output = counter->output;
+ rcu_assign_pointer(counter->output, output_counter);
+ mutex_unlock(&counter->mmap_mutex);
+
+ if (old_output) {
+ /*
+ * we need to make sure no existing perf_output_*()
+ * is still referencing this counter.
+ */
+ synchronize_rcu();
+ fput(old_output->filp);
+ }
+
+ ret = 0;
+out:
+ fput_light(output_file, fput_needed);
+ return ret;
+}
+
+/**
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
+ *
+ * @attr_uptr: event type attributes for monitoring/sampling
+ * @pid: target pid
+ * @cpu: target cpu
+ * @group_fd: group leader counter fd
+ */
+SYSCALL_DEFINE5(perf_counter_open,
+ struct perf_counter_attr __user *, attr_uptr,
+ pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+ struct perf_counter *counter, *group_leader;
+ struct perf_counter_attr attr;
+ struct perf_counter_context *ctx;
+ struct file *counter_file = NULL;
+ struct file *group_file = NULL;
+ int fput_needed = 0;
+ int fput_needed2 = 0;
+ int err;
+
+ /* for future expandability... */
+ if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+ return -EINVAL;
+
+ err = perf_copy_attr(attr_uptr, &attr);
+ if (err)
+ return err;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+ return -EINVAL;
+ }
+
+ /*
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(pid, cpu);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /*
+ * Look up the group leader (we will attach this counter to it):
+ */
+ group_leader = NULL;
+ if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ err = -EINVAL;
+ group_file = fget_light(group_fd, &fput_needed);
+ if (!group_file)
+ goto err_put_context;
+ if (group_file->f_op != &perf_fops)
+ goto err_put_context;
+
+ group_leader = group_file->private_data;
+ /*
+ * Do not allow a recursive hierarchy (this new sibling
+ * becoming part of another group-sibling):
+ */
+ if (group_leader->group_leader != group_leader)
+ goto err_put_context;
+ /*
+ * Do not allow to attach to a group in a different
+ * task or CPU context:
+ */
+ if (group_leader->ctx != ctx)
+ goto err_put_context;
+ /*
+ * Only a group leader can be exclusive or pinned
+ */
+ if (attr.exclusive || attr.pinned)
+ goto err_put_context;
+ }
+
+ counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
+ NULL, GFP_KERNEL);
+ err = PTR_ERR(counter);
+ if (IS_ERR(counter))
+ goto err_put_context;
+
+ err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+ if (err < 0)
+ goto err_free_put_context;
+
+ counter_file = fget_light(err, &fput_needed2);
+ if (!counter_file)
+ goto err_free_put_context;
+
+ if (flags & PERF_FLAG_FD_OUTPUT) {
+ err = perf_counter_set_output(counter, group_fd);
+ if (err)
+ goto err_fput_free_put_context;
+ }
+
+ counter->filp = counter_file;
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ perf_install_in_context(ctx, counter, cpu);
+ ++ctx->generation;
+ mutex_unlock(&ctx->mutex);
+
+ counter->owner = current;
+ get_task_struct(current);
+ mutex_lock(&current->perf_counter_mutex);
+ list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+ mutex_unlock(&current->perf_counter_mutex);
+
+err_fput_free_put_context:
+ fput_light(counter_file, fput_needed2);
+
+err_free_put_context:
+ if (err < 0)
+ kfree(counter);
+
+err_put_context:
+ if (err < 0)
+ put_ctx(ctx);
+
+ fput_light(group_file, fput_needed);
+
+ return err;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static struct perf_counter *
+inherit_counter(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter *group_leader,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *child_counter;
+
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inherited counters back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ if (parent_counter->parent)
+ parent_counter = parent_counter->parent;
+
+ child_counter = perf_counter_alloc(&parent_counter->attr,
+ parent_counter->cpu, child_ctx,
+ group_leader, parent_counter,
+ GFP_KERNEL);
+ if (IS_ERR(child_counter))
+ return child_counter;
+ get_ctx(child_ctx);
+
+ /*
+ * Make the child state follow the state of the parent counter,
+ * not its attr.disabled bit. We hold the parent's mutex,
+ * so we won't race with perf_counter_{en, dis}able_family.
+ */
+ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+ else
+ child_counter->state = PERF_COUNTER_STATE_OFF;
+
+ if (parent_counter->attr.freq)
+ child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
+ /*
+ * Link it up in the child's context:
+ */
+ add_counter_to_ctx(child_counter, child_ctx);
+
+ /*
+ * Get a reference to the parent filp - we will fput it
+ * when the child counter exits. This is safe to do because
+ * we are in the parent and we know that the filp still
+ * exists and has a nonzero count:
+ */
+ atomic_long_inc(&parent_counter->filp->f_count);
+
+ /*
+ * Link this into the parent counter's child list
+ */
+ WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+ mutex_lock(&parent_counter->child_mutex);
+ list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+ mutex_unlock(&parent_counter->child_mutex);
+
+ return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *leader;
+ struct perf_counter *sub;
+ struct perf_counter *child_ctr;
+
+ leader = inherit_counter(parent_counter, parent, parent_ctx,
+ child, NULL, child_ctx);
+ if (IS_ERR(leader))
+ return PTR_ERR(leader);
+ list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+ child_ctr = inherit_counter(sub, parent, parent_ctx,
+ child, leader, child_ctx);
+ if (IS_ERR(child_ctr))
+ return PTR_ERR(child_ctr);
+ }
+ return 0;
+}
+
+static void sync_child_counter(struct perf_counter *child_counter,
+ struct task_struct *child)
+{
+ struct perf_counter *parent_counter = child_counter->parent;
+ u64 child_val;
+
+ if (child_counter->attr.inherit_stat)
+ perf_counter_read_event(child_counter, child);
+
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+ atomic64_add(child_counter->total_time_enabled,
+ &parent_counter->child_total_time_enabled);
+ atomic64_add(child_counter->total_time_running,
+ &parent_counter->child_total_time_running);
+
+ /*
+ * Remove this counter from the parent's list
+ */
+ WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
+ mutex_lock(&parent_counter->child_mutex);
+ list_del_init(&child_counter->child_list);
+ mutex_unlock(&parent_counter->child_mutex);
+
+ /*
+ * Release the parent counter, if this was the last
+ * reference to it.
+ */
+ fput(parent_counter->filp);
+}
+
+static void
+__perf_counter_exit_task(struct perf_counter *child_counter,
+ struct perf_counter_context *child_ctx,
+ struct task_struct *child)
+{
+ struct perf_counter *parent_counter;
+
+ update_counter_times(child_counter);
+ perf_counter_remove_from_context(child_counter);
+
+ parent_counter = child_counter->parent;
+ /*
+ * It can happen that parent exits first, and has counters
+ * that are still around due to the child reference. These
+ * counters need to be zapped - but otherwise linger.
+ */
+ if (parent_counter) {
+ sync_child_counter(child_counter, child);
+ free_counter(child_counter);
+ }
+}
+
+/*
+ * When a child task exits, feed back counter values to parent counters.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+ struct perf_counter *child_counter, *tmp;
+ struct perf_counter_context *child_ctx;
+ unsigned long flags;
+
+ if (likely(!child->perf_counter_ctxp)) {
+ perf_counter_task(child, NULL, 0);
+ return;
+ }
+
+ local_irq_save(flags);
+ /*
+ * We can't reschedule here because interrupts are disabled,
+ * and either child is current or it is a task that can't be
+ * scheduled, so we are now safe from rescheduling changing
+ * our context.
+ */
+ child_ctx = child->perf_counter_ctxp;
+ __perf_counter_task_sched_out(child_ctx);
+
+ /*
+ * Take the context lock here so that if find_get_context is
+ * reading child->perf_counter_ctxp, we wait until it has
+ * incremented the context's refcount before we do put_ctx below.
+ */
+ spin_lock(&child_ctx->lock);
+ child->perf_counter_ctxp = NULL;
+ /*
+ * If this context is a clone; unclone it so it can't get
+ * swapped to another process while we're removing all
+ * the counters from it.
+ */
+ unclone_ctx(child_ctx);
+ spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Report the task dead after unscheduling the counters so that we
+ * won't get any samples after PERF_EVENT_EXIT. We can however still
+ * get a few PERF_EVENT_READ events.
+ */
+ perf_counter_task(child, child_ctx, 0);
+
+ /*
+ * We can recurse on the same lock type through:
+ *
+ * __perf_counter_exit_task()
+ * sync_child_counter()
+ * fput(parent_counter->filp)
+ * perf_release()
+ * mutex_lock(&ctx->mutex)
+ *
+ * But since its the parent context it won't be the same instance.
+ */
+ mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+
+again:
+ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+ list_entry)
+ __perf_counter_exit_task(child_counter, child_ctx, child);
+
+ /*
+ * If the last counter was a group counter, it will have appended all
+ * its siblings to the list, but we obtained 'tmp' before that which
+ * will still point to the list head terminating the iteration.
+ */
+ if (!list_empty(&child_ctx->counter_list))
+ goto again;
+
+ mutex_unlock(&child_ctx->mutex);
+
+ put_ctx(child_ctx);
+}
+
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+ struct perf_counter_context *ctx = task->perf_counter_ctxp;
+ struct perf_counter *counter, *tmp;
+
+ if (!ctx)
+ return;
+
+ mutex_lock(&ctx->mutex);
+again:
+ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+ struct perf_counter *parent = counter->parent;
+
+ if (WARN_ON_ONCE(!parent))
+ continue;
+
+ mutex_lock(&parent->child_mutex);
+ list_del_init(&counter->child_list);
+ mutex_unlock(&parent->child_mutex);
+
+ fput(parent->filp);
+
+ list_del_counter(counter, ctx);
+ free_counter(counter);
+ }
+
+ if (!list_empty(&ctx->counter_list))
+ goto again;
+
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+int perf_counter_init_task(struct task_struct *child)
+{
+ struct perf_counter_context *child_ctx, *parent_ctx;
+ struct perf_counter_context *cloned_ctx;
+ struct perf_counter *counter;
+ struct task_struct *parent = current;
+ int inherited_all = 1;
+ int ret = 0;
+
+ child->perf_counter_ctxp = NULL;
+
+ mutex_init(&child->perf_counter_mutex);
+ INIT_LIST_HEAD(&child->perf_counter_list);
+
+ if (likely(!parent->perf_counter_ctxp))
+ return 0;
+
+ /*
+ * This is executed from the parent task context, so inherit
+ * counters that have been marked for cloning.
+ * First allocate and initialize a context for the child.
+ */
+
+ child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+ if (!child_ctx)
+ return -ENOMEM;
+
+ __perf_counter_init_context(child_ctx, child);
+ child->perf_counter_ctxp = child_ctx;
+ get_task_struct(child);
+
+ /*
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
+ */
+ parent_ctx = perf_pin_task_context(parent);
+
+ /*
+ * No need to check if parent_ctx != NULL here; since we saw
+ * it non-NULL earlier, the only reason for it to become NULL
+ * is if we exit, and since we're currently in the middle of
+ * a fork we can't be exiting at the same time.
+ */
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ mutex_lock(&parent_ctx->mutex);
+
+ /*
+ * We dont have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+ if (counter != counter->group_leader)
+ continue;
+
+ if (!counter->attr.inherit) {
+ inherited_all = 0;
+ continue;
+ }
+
+ ret = inherit_group(counter, parent, parent_ctx,
+ child, child_ctx);
+ if (ret) {
+ inherited_all = 0;
+ break;
+ }
+ }
+
+ if (inherited_all) {
+ /*
+ * Mark the child context as a clone of the parent
+ * context, or of whatever the parent is a clone of.
+ * Note that if the parent is a clone, it could get
+ * uncloned at any point, but that doesn't matter
+ * because the list of counters and the generation
+ * count can't have changed since we took the mutex.
+ */
+ cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+ if (cloned_ctx) {
+ child_ctx->parent_ctx = cloned_ctx;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
+ } else {
+ child_ctx->parent_ctx = parent_ctx;
+ child_ctx->parent_gen = parent_ctx->generation;
+ }
+ get_ctx(child_ctx->parent_ctx);
+ }
+
+ mutex_unlock(&parent_ctx->mutex);
+
+ perf_unpin_context(parent_ctx);
+
+ return ret;
+}
+
+static void __cpuinit perf_counter_init_cpu(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ __perf_counter_init_context(&cpuctx->ctx, NULL);
+
+ spin_lock(&perf_resource_lock);
+ cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+ spin_unlock(&perf_resource_lock);
+
+ hw_perf_counter_setup(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_counter_exit_cpu(void *info)
+{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+ struct perf_counter *counter, *tmp;
+
+ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+ __perf_counter_remove_from_context(counter);
+}
+static void perf_counter_exit_cpu(int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+ mutex_unlock(&ctx->mutex);
+}
+#else
+static inline void perf_counter_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action) {
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ perf_counter_init_cpu(cpu);
+ break;
+
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ hw_perf_counter_setup_online(cpu);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ perf_counter_exit_cpu(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+ .notifier_call = perf_cpu_notify,
+ .priority = 20,
+};
+
+void __init perf_counter_init(void)
+{
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+ (void *)(long)smp_processor_id());
+ register_cpu_notifier(&perf_cpu_nb);
+}
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+ const char *buf,
+ size_t count)
+{
+ struct perf_cpu_context *cpuctx;
+ unsigned long val;
+ int err, cpu, mpt;
+
+ err = strict_strtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val > perf_max_counters)
+ return -EINVAL;
+
+ spin_lock(&perf_resource_lock);
+ perf_reserved_percpu = val;
+ for_each_online_cpu(cpu) {
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ spin_lock_irq(&cpuctx->ctx.lock);
+ mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+ perf_max_counters - perf_reserved_percpu);
+ cpuctx->max_pertask = mpt;
+ spin_unlock_irq(&cpuctx->ctx.lock);
+ }
+ spin_unlock(&perf_resource_lock);
+
+ return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+ unsigned long val;
+ int err;
+
+ err = strict_strtoul(buf, 10, &val);
+ if (err)
+ return err;
+ if (val > 1)
+ return -EINVAL;
+
+ spin_lock(&perf_resource_lock);
+ perf_overcommit = val;
+ spin_unlock(&perf_resource_lock);
+
+ return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+ reserve_percpu,
+ 0644,
+ perf_show_reserve_percpu,
+ perf_set_reserve_percpu
+ );
+
+static SYSDEV_CLASS_ATTR(
+ overcommit,
+ 0644,
+ perf_show_overcommit,
+ perf_set_overcommit
+ );
+
+static struct attribute *perfclass_attrs[] = {
+ &attr_reserve_percpu.attr,
+ &attr_overcommit.attr,
+ NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+ .attrs = perfclass_attrs,
+ .name = "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+ return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+ &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753..31310b5d3f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,31 +378,22 @@ EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
-struct task_struct *find_task_by_pid_type_ns(int type, int nr,
- struct pid_namespace *ns)
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- return pid_task(find_pid_ns(nr, ns), type);
+ return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
-EXPORT_SYMBOL(find_task_by_pid_type_ns);
-
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
- current->nsproxy->pid_ns);
-}
-EXPORT_SYMBOL(find_task_by_vpid);
-
-struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
-{
- return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
+ return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
}
-EXPORT_SYMBOL(find_task_by_pid_ns);
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
+ if (type != PIDTYPE_PID)
+ task = task->group_leader;
pid = get_pid(task->pids[type].pid);
rcu_read_unlock();
return pid;
@@ -450,11 +441,24 @@ pid_t pid_vnr(struct pid *pid)
}
EXPORT_SYMBOL_GPL(pid_vnr);
-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+ struct pid_namespace *ns)
{
- return pid_nr_ns(task_pid(tsk), ns);
+ pid_t nr = 0;
+
+ rcu_read_lock();
+ if (!ns)
+ ns = current->nsproxy->pid_ns;
+ if (likely(pid_alive(task))) {
+ if (type != PIDTYPE_PID)
+ task = task->group_leader;
+ nr = pid_nr_ns(task->pids[type].pid, ns);
+ }
+ rcu_read_unlock();
+
+ return nr;
}
-EXPORT_SYMBOL(task_pid_nr_ns);
+EXPORT_SYMBOL(__task_pid_nr_ns);
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
@@ -462,18 +466,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
}
EXPORT_SYMBOL(task_tgid_nr_ns);
-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_pgrp(tsk), ns);
-}
-EXPORT_SYMBOL(task_pgrp_nr_ns);
-
-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_session(tsk), ns);
-}
-EXPORT_SYMBOL(task_session_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac..821722ae58a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
return NULL;
}
-static struct pid_namespace *create_pid_namespace(unsigned int level)
+static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
+ unsigned int level = parent_pid_ns->level + 1;
int i;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
kref_init(&ns->kref);
ns->level = level;
+ ns->parent = get_pid_ns(parent_pid_ns);
set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
- struct pid_namespace *new_ns;
-
- BUG_ON(!old_ns);
- new_ns = get_pid_ns(old_ns);
if (!(flags & CLONE_NEWPID))
- goto out;
-
- new_ns = ERR_PTR(-EINVAL);
+ return get_pid_ns(old_ns);
if (flags & CLONE_THREAD)
- goto out_put;
-
- new_ns = create_pid_namespace(old_ns->level + 1);
- if (!IS_ERR(new_ns))
- new_ns->parent = get_pid_ns(old_ns);
-
-out_put:
- put_pid_ns(old_ns);
-out:
- return new_ns;
+ return ERR_PTR(-EINVAL);
+ return create_pid_namespace(old_ns);
}
void free_pid_ns(struct kref *kref)
@@ -152,6 +140,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
+ struct task_struct *task;
/*
* The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +158,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
read_lock(&tasklist_lock);
nr = next_pidmap(pid_ns, 1);
while (nr > 0) {
- kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
+ rcu_read_lock();
+
+ /*
+ * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
+ * any nested-container's init processes don't ignore the
+ * signal
+ */
+ task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ if (task)
+ force_sig(SIGKILL, task);
+
+ rcu_read_unlock();
+
nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a4783..e33a21cb940 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
#include <linux/kernel_stat.h>
/*
- * Allocate the thread_group_cputime structure appropriately and fill in the
- * current values of the fields. Called from copy_signal() via
- * thread_group_cputime_clone_thread() when adding a second or subsequent
- * thread to a thread group. Assumes interrupts are enabled when called.
- */
-int thread_group_cputime_alloc(struct task_struct *tsk)
-{
- struct signal_struct *sig = tsk->signal;
- struct task_cputime *cputime;
-
- /*
- * If we have multiple threads and we don't already have a
- * per-CPU task_cputime struct (checked in the caller), allocate
- * one and fill it in with the times accumulated so far. We may
- * race with another thread so recheck after we pick up the sighand
- * lock.
- */
- cputime = alloc_percpu(struct task_cputime);
- if (cputime == NULL)
- return -ENOMEM;
- spin_lock_irq(&tsk->sighand->siglock);
- if (sig->cputime.totals) {
- spin_unlock_irq(&tsk->sighand->siglock);
- free_percpu(cputime);
- return 0;
- }
- sig->cputime.totals = cputime;
- cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
- cputime->utime = tsk->utime;
- cputime->stime = tsk->stime;
- cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
- spin_unlock_irq(&tsk->sighand->siglock);
- return 0;
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * @tsk: The task we use to identify the thread group.
- * @times: task_cputime structure in which we return the summed fields.
- *
- * Walk the list of CPUs to sum the per-CPU time fields in the thread group
- * time structure.
- */
-void thread_group_cputime(
- struct task_struct *tsk,
- struct task_cputime *times)
-{
- struct task_cputime *totals, *tot;
- int i;
-
- totals = tsk->signal->cputime.totals;
- if (!totals) {
- times->utime = tsk->utime;
- times->stime = tsk->stime;
- times->sum_exec_runtime = tsk->se.sum_exec_runtime;
- return;
- }
-
- times->stime = times->utime = cputime_zero;
- times->sum_exec_runtime = 0;
- for_each_possible_cpu(i) {
- tot = per_cpu_ptr(totals, i);
- times->utime = cputime_add(times->utime, tot->utime);
- times->stime = cputime_add(times->stime, tot->stime);
- times->sum_exec_runtime += tot->sum_exec_runtime;
- }
-}
-
-/*
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
*/
void update_rlimit_cpu(unsigned long rlim_new)
@@ -88,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
cputime = secs_to_cputime(rlim_new);
if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
- cputime_lt(current->signal->it_prof_expires, cputime)) {
+ cputime_gt(current->signal->it_prof_expires, cputime)) {
spin_lock_irq(&current->sighand->siglock);
set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
spin_unlock_irq(&current->sighand->siglock);
@@ -294,12 +224,77 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = task_sched_runtime(p);
break;
}
return 0;
}
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct sighand_struct *sighand;
+ struct signal_struct *sig;
+ struct task_struct *t;
+
+ *times = INIT_CPUTIME;
+
+ rcu_read_lock();
+ sighand = rcu_dereference(tsk->sighand);
+ if (!sighand)
+ goto out;
+
+ sig = tsk->signal;
+
+ t = tsk;
+ do {
+ times->utime = cputime_add(times->utime, t->utime);
+ times->stime = cputime_add(times->stime, t->stime);
+ times->sum_exec_runtime += t->se.sum_exec_runtime;
+
+ t = next_thread(t);
+ } while (t != tsk);
+
+ times->utime = cputime_add(times->utime, sig->utime);
+ times->stime = cputime_add(times->stime, sig->stime);
+ times->sum_exec_runtime += sig->sum_sched_runtime;
+out:
+ rcu_read_unlock();
+}
+
+static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+{
+ if (cputime_gt(b->utime, a->utime))
+ a->utime = b->utime;
+
+ if (cputime_gt(b->stime, a->stime))
+ a->stime = b->stime;
+
+ if (b->sum_exec_runtime > a->sum_exec_runtime)
+ a->sum_exec_runtime = b->sum_exec_runtime;
+}
+
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct task_cputime sum;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cputimer->lock, flags);
+ if (!cputimer->running) {
+ cputimer->running = 1;
+ /*
+ * The POSIX timer interface allows for absolute time expiry
+ * values through the TIMER_ABSTIME flag, therefore we have
+ * to synchronize the timer to the clock every time we start
+ * it.
+ */
+ thread_group_cputime(tsk, &sum);
+ update_gt_cputime(&cputimer->cputime, &sum);
+ }
+ *times = cputimer->cputime;
+ spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
/*
* Sample a process (thread group) clock for the given group_leader task.
* Must be called with tasklist_lock held for reading.
@@ -310,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;
- thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
- cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
@@ -525,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- struct task_cputime cputime;
+ struct signal_struct *const sig = tsk->signal;
- thread_group_cputime(tsk, &cputime);
cleanup_timers(tsk->signal->cpu_timers,
- cputime.utime, cputime.stime, cputime.sum_exec_runtime);
+ cputime_add(tsk->utime, sig->utime),
+ cputime_add(tsk->stime, sig->stime),
+ tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -686,6 +683,33 @@ static void cpu_timer_fire(struct k_itimer *timer)
}
/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+ struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputimer(p, &cputime);
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = cputime.utime;
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+ break;
+ }
+ return 0;
+}
+
+/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
* If we return TIMER_RETRY, it's necessary to release the timer's lock
@@ -746,7 +770,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
cpu_clock_sample(timer->it_clock, p, &val);
} else {
- cpu_clock_sample_group(timer->it_clock, p, &val);
+ cpu_timer_sample_group(timer->it_clock, p, &val);
}
if (old) {
@@ -894,7 +918,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
read_unlock(&tasklist_lock);
goto dead;
} else {
- cpu_clock_sample_group(timer->it_clock, p, &now);
+ cpu_timer_sample_group(timer->it_clock, p, &now);
clear_dead = (unlikely(p->exit_state) &&
thread_group_empty(p));
}
@@ -1034,6 +1058,19 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
+static void stop_process_timers(struct task_struct *tsk)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ unsigned long flags;
+
+ if (!cputimer->running)
+ return;
+
+ spin_lock_irqsave(&cputimer->lock, flags);
+ cputimer->running = 0;
+ spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1057,13 +1094,15 @@ static void check_process_timers(struct task_struct *tsk,
sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
list_empty(&timers[CPUCLOCK_VIRT]) &&
cputime_eq(sig->it_virt_expires, cputime_zero) &&
- list_empty(&timers[CPUCLOCK_SCHED]))
+ list_empty(&timers[CPUCLOCK_SCHED])) {
+ stop_process_timers(tsk);
return;
+ }
/*
* Collect the current process totals.
*/
- thread_group_cputime(tsk, &cputime);
+ thread_group_cputimer(tsk, &cputime);
utime = cputime.utime;
ptime = cputime_add(utime, cputime.stime);
sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1234,7 +1273,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
clear_dead_task(timer, now);
goto out_unlock;
}
- cpu_clock_sample_group(timer->it_clock, p, &now);
+ cpu_timer_sample_group(timer->it_clock, p, &now);
bump_cpu_timer(timer, now);
/* Leave the tasklist_lock locked for the call below. */
}
@@ -1329,11 +1368,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (!task_cputime_zero(&sig->cputime_expires)) {
struct task_cputime group_sample;
- thread_group_cputime(tsk, &group_sample);
+ thread_group_cputimer(tsk, &group_sample);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
}
- return 0;
+
+ return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
}
/*
@@ -1381,19 +1421,19 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* timer call will interfere.
*/
list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
- int firing;
+ int cpu_firing;
+
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.entry);
- firing = timer->it.cpu.firing;
+ cpu_firing = timer->it.cpu.firing;
timer->it.cpu.firing = 0;
/*
* The firing flag is -1 if we collided with a reset
* of the timer, which already reported this
* almost-firing as an overrun. So don't generate an event.
*/
- if (likely(firing >= 0)) {
+ if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
- }
spin_unlock(&timer->it_lock);
}
}
@@ -1411,7 +1451,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
struct list_head *head;
BUG_ON(clock_idx == CPUCLOCK_SCHED);
- cpu_clock_sample_group(clock_idx, tsk, &now);
+ cpu_timer_sample_group(clock_idx, tsk, &now);
if (oldval) {
if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c..d089d052c4a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
return -EOPNOTSUPP;
}
+static int no_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsave, struct timespec __user *rmtp)
+{
+ return -EOPNOTSUPP;
+}
+
/*
* Return nonzero if we know a priori this clockid_t value is bogus.
*/
@@ -254,6 +260,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_raw,
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
+ .nsleep = no_nsleep,
};
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96..72067cbdb37 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config HIBERNATION_NVS
+ bool
+
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+ select HIBERNATION_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index d7a10167a25..c3b81c30e5d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,12 @@ ifeq ($(CONFIG_PM_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
-obj-y := main.o
+obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
-obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
+obj-$(CONFIG_SUSPEND) += suspend.o
+obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
+obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b8628be2a46..a3961b205de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -78,6 +78,12 @@ void pm_restore_console(void)
}
set_console(orig_fgconsole);
release_console_sem();
+
+ if (vt_waitactive(orig_fgconsole)) {
+ pr_debug("Resume: Can't switch VCs.");
+ return;
+ }
+
kmsg_redirect = orig_kmsg;
}
#endif
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index 45e8541ab7e..81d2e746489 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
/*
- * kernel/power/disk.c - Suspend-to-disk support.
+ * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
*
* This file is released under the GPLv2.
- *
*/
#include <linux/suspend.h>
@@ -22,6 +22,8 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
+#include <asm/suspend.h>
#include "power.h"
@@ -71,6 +73,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
mutex_unlock(&pm_mutex);
}
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+ return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
@@ -205,18 +215,34 @@ static int create_image(int platform_mode)
if (error)
return error;
- device_pm_lock();
- local_irq_disable();
- /* At this point, device_suspend() has been called, but *not*
- * device_power_down(). We *must* call device_power_down() now.
+ /* At this point, dpm_suspend_start() has been called, but *not*
+ * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
* Otherwise, drivers for some devices (e.g. interrupt controllers)
* become desynchronized with the actual state of the hardware
* at resume time, and evil weirdness ensues.
*/
- error = device_power_down(PMSG_FREEZE);
+ error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
+ return error;
+ }
+
+ error = platform_pre_snapshot(platform_mode);
+ if (error || hibernation_test(TEST_PLATFORM))
+ goto Platform_finish;
+
+ error = disable_nonboot_cpus();
+ if (error || hibernation_test(TEST_CPUS)
+ || hibernation_testmode(HIBERNATION_TEST))
+ goto Enable_cpus;
+
+ local_irq_disable();
+
+ error = sysdev_suspend(PMSG_FREEZE);
+ if (error) {
+ printk(KERN_ERR "PM: Some system devices failed to power down, "
+ "aborting hibernation\n");
goto Enable_irqs;
}
@@ -233,15 +259,25 @@ static int create_image(int platform_mode)
restore_processor_state();
if (!in_suspend)
platform_leave(platform_mode);
+
Power_up:
- /* NOTE: device_power_up() is just a resume() for devices
+ sysdev_resume();
+ /* NOTE: dpm_resume_noirq() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
*/
- device_power_up(in_suspend ?
- (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+
Enable_irqs:
local_irq_enable();
- device_pm_unlock();
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Platform_finish:
+ platform_finish(platform_mode);
+
+ dpm_resume_noirq(in_suspend ?
+ (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+
return error;
}
@@ -249,7 +285,7 @@ static int create_image(int platform_mode)
* hibernation_snapshot - quiesce devices and create the hibernation
* snapshot image.
* @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform frimware for the power transition.
+ * prepare the platform firmware for the power transition.
*
* Must be called with pm_mutex held
*/
@@ -268,34 +304,18 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
- error = device_suspend(PMSG_FREEZE);
+ error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
if (hibernation_test(TEST_DEVICES))
goto Recover_platform;
- error = platform_pre_snapshot(platform_mode);
- if (error || hibernation_test(TEST_PLATFORM))
- goto Finish;
-
- error = disable_nonboot_cpus();
- if (!error) {
- if (hibernation_test(TEST_CPUS))
- goto Enable_cpus;
-
- if (hibernation_testmode(HIBERNATION_TEST))
- goto Enable_cpus;
+ error = create_image(platform_mode);
+ /* Control returns here after successful restore */
- error = create_image(platform_mode);
- /* Control returns here after successful restore */
- }
- Enable_cpus:
- enable_nonboot_cpus();
- Finish:
- platform_finish(platform_mode);
Resume_devices:
- device_resume(in_suspend ?
+ dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
resume_console();
Close:
@@ -315,18 +335,31 @@ int hibernation_snapshot(int platform_mode)
* kernel.
*/
-static int resume_target_kernel(void)
+static int resume_target_kernel(bool platform_mode)
{
int error;
- device_pm_lock();
- local_irq_disable();
- error = device_power_down(PMSG_QUIESCE);
+ error = dpm_suspend_noirq(PMSG_QUIESCE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting resume\n");
- goto Enable_irqs;
+ return error;
}
+
+ error = platform_pre_restore(platform_mode);
+ if (error)
+ goto Cleanup;
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+
+ local_irq_disable();
+
+ error = sysdev_suspend(PMSG_QUIESCE);
+ if (error)
+ goto Enable_irqs;
+
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
@@ -349,10 +382,20 @@ static int resume_target_kernel(void)
swsusp_free();
restore_processor_state();
touch_softlockup_watchdog();
- device_power_up(PMSG_RECOVER);
+
+ sysdev_resume();
+
Enable_irqs:
local_irq_enable();
- device_pm_unlock();
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Cleanup:
+ platform_restore_cleanup(platform_mode);
+
+ dpm_resume_noirq(PMSG_RECOVER);
+
return error;
}
@@ -360,7 +403,7 @@ static int resume_target_kernel(void)
* hibernation_restore - quiesce devices and restore the hibernation
* snapshot image. If successful, control returns in hibernation_snaphot()
* @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform frimware for the transition.
+ * prepare the platform firmware for the transition.
*
* Must be called with pm_mutex held
*/
@@ -371,20 +414,11 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- error = device_suspend(PMSG_QUIESCE);
- if (error)
- goto Finish;
-
- error = platform_pre_restore(platform_mode);
+ error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
- error = disable_nonboot_cpus();
- if (!error)
- error = resume_target_kernel();
- enable_nonboot_cpus();
+ error = resume_target_kernel(platform_mode);
+ dpm_resume_end(PMSG_RECOVER);
}
- platform_restore_cleanup(platform_mode);
- device_resume(PMSG_RECOVER);
- Finish:
resume_console();
pm_restore_console();
return error;
@@ -411,44 +445,50 @@ int hibernation_platform_enter(void)
if (error)
goto Close;
+ entering_platform_hibernation = true;
suspend_console();
- error = device_suspend(PMSG_HIBERNATE);
+ error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
hibernation_ops->recover();
goto Resume_devices;
}
- error = hibernation_ops->prepare();
+ error = dpm_suspend_noirq(PMSG_HIBERNATE);
if (error)
goto Resume_devices;
+ error = hibernation_ops->prepare();
+ if (error)
+ goto Platofrm_finish;
+
error = disable_nonboot_cpus();
if (error)
- goto Finish;
+ goto Platofrm_finish;
- device_pm_lock();
local_irq_disable();
- error = device_power_down(PMSG_HIBERNATE);
- if (!error) {
- hibernation_ops->enter();
- /* We should never get here */
- while (1);
- }
- local_irq_enable();
- device_pm_unlock();
+ sysdev_suspend(PMSG_HIBERNATE);
+ hibernation_ops->enter();
+ /* We should never get here */
+ while (1);
/*
* We don't need to reenable the nonboot CPUs or resume consoles, since
* the system is going to be halted anyway.
*/
- Finish:
+ Platofrm_finish:
hibernation_ops->finish();
+
+ dpm_suspend_noirq(PMSG_RESTORE);
+
Resume_devices:
- device_resume(PMSG_RESTORE);
+ entering_platform_hibernation = false;
+ dpm_resume_end(PMSG_RESTORE);
resume_console();
+
Close:
hibernation_ops->end();
+
return error;
}
@@ -585,6 +625,12 @@ static int software_resume(void)
unsigned int flags;
/*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume)
+ return 0;
+
+ /*
* name_to_dev_t() below takes a sysfs buffer mutex when sysfs
* is configured into the kernel. Since the regular hibernate
* trigger path is via sysfs which takes a buffer mutex before
@@ -595,28 +641,43 @@ static int software_resume(void)
* here to avoid lockdep complaining.
*/
mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
- if (!swsusp_resume_device) {
- if (!strlen(resume_file)) {
- mutex_unlock(&pm_mutex);
- return -ENOENT;
- }
- swsusp_resume_device = name_to_dev_t(resume_file);
- pr_debug("PM: Resume from partition %s\n", resume_file);
- } else {
- pr_debug("PM: Resume from partition %d:%d\n",
- MAJOR(swsusp_resume_device),
- MINOR(swsusp_resume_device));
+
+ if (swsusp_resume_device)
+ goto Check_image;
+
+ if (!strlen(resume_file)) {
+ error = -ENOENT;
+ goto Unlock;
}
- if (noresume) {
- /**
- * FIXME: If noresume is specified, we need to find the
- * partition and reset it back to normal swap space.
+ pr_debug("PM: Checking image partition %s\n", resume_file);
+
+ /* Check if the device is there */
+ swsusp_resume_device = name_to_dev_t(resume_file);
+ if (!swsusp_resume_device) {
+ /*
+ * Some device discovery might still be in progress; we need
+ * to wait for this to finish.
*/
- mutex_unlock(&pm_mutex);
- return 0;
+ wait_for_device_probe();
+ /*
+ * We can't depend on SCSI devices being available after loading
+ * one of their modules until scsi_complete_async_scans() is
+ * called and the resume device usually is a SCSI one.
+ */
+ scsi_complete_async_scans();
+
+ swsusp_resume_device = name_to_dev_t(resume_file);
+ if (!swsusp_resume_device) {
+ error = -ENODEV;
+ goto Unlock;
+ }
}
+ Check_image:
+ pr_debug("PM: Resume from partition %d:%d\n",
+ MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+
pr_debug("PM: Checking hibernation image.\n");
error = swsusp_check();
if (error)
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 00000000000..39ac698ef83
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
+/*
+ * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ *
+ * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * hibernation and to restore the contents of this memory during the subsequent
+ * resume. The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+ unsigned long phys_start;
+ unsigned int size;
+ void *kaddr;
+ void *data;
+ struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ * hibernate_nvs_register - register platform NVS memory region to save
+ * @start - physical address of the region
+ * @size - size of the region
+ *
+ * The NVS region need not be page-aligned (both ends) and we arrange
+ * things so that the data from page-aligned addresses in this region will
+ * be copied into separate RAM pages.
+ */
+int hibernate_nvs_register(unsigned long start, unsigned long size)
+{
+ struct nvs_page *entry, *next;
+
+ while (size > 0) {
+ unsigned int nr_bytes;
+
+ entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+ if (!entry)
+ goto Error;
+
+ list_add_tail(&entry->node, &nvs_list);
+ entry->phys_start = start;
+ nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+ entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+ start += entry->size;
+ size -= entry->size;
+ }
+ return 0;
+
+ Error:
+ list_for_each_entry_safe(entry, next, &nvs_list, node) {
+ list_del(&entry->node);
+ kfree(entry);
+ }
+ return -ENOMEM;
+}
+
+/**
+ * hibernate_nvs_free - free data pages allocated for saving NVS regions
+ */
+void hibernate_nvs_free(void)
+{
+ struct nvs_page *entry;
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data) {
+ free_page((unsigned long)entry->data);
+ entry->data = NULL;
+ if (entry->kaddr) {
+ iounmap(entry->kaddr);
+ entry->kaddr = NULL;
+ }
+ }
+}
+
+/**
+ * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int hibernate_nvs_alloc(void)
+{
+ struct nvs_page *entry;
+
+ list_for_each_entry(entry, &nvs_list, node) {
+ entry->data = (void *)__get_free_page(GFP_KERNEL);
+ if (!entry->data) {
+ hibernate_nvs_free();
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/**
+ * hibernate_nvs_save - save NVS memory regions
+ */
+void hibernate_nvs_save(void)
+{
+ struct nvs_page *entry;
+
+ printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data) {
+ entry->kaddr = ioremap(entry->phys_start, entry->size);
+ memcpy(entry->data, entry->kaddr, entry->size);
+ }
+}
+
+/**
+ * hibernate_nvs_restore - restore NVS memory regions
+ *
+ * This function is going to be called with interrupts disabled, so it
+ * cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void hibernate_nvs_restore(void)
+{
+ struct nvs_page *entry;
+
+ printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+ list_for_each_entry(entry, &nvs_list, node)
+ if (entry->data)
+ memcpy(entry->kaddr, entry->data, entry->size);
+}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 23998887397..f710e36930c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
*
*/
-#include <linux/module.h>
-#include <linux/suspend.h>
#include <linux/kobject.h>
#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/kmod.h>
-#include <linux/init.h>
-#include <linux/console.h>
-#include <linux/cpu.h>
#include <linux/resume-trace.h>
-#include <linux/freezer.h>
-#include <linux/vmstat.h>
-#include <linux/syscalls.h>
#include "power.h"
@@ -57,16 +46,6 @@ int pm_notifier_call_chain(unsigned long val)
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
-static int suspend_test(int level)
-{
- if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
- return 1;
- }
- return 0;
-}
-
static const char * const pm_tests[__TEST_AFTER_LAST] = {
[TEST_NONE] = "none",
[TEST_CORE] = "core",
@@ -125,347 +104,10 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#else /* !CONFIG_PM_DEBUG */
-static inline int suspend_test(int level) { return 0; }
-#endif /* !CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_DEBUG */
#endif /* CONFIG_PM_SLEEP */
-#ifdef CONFIG_SUSPEND
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-/*
- * We test the system suspend code by setting an RTC wakealarm a short
- * time in the future, then suspending. Suspending the devices won't
- * normally take long ... some systems only need a few milliseconds.
- *
- * The time it takes is system-specific though, so when we test this
- * during system bootup we allow a LOT of time.
- */
-#define TEST_SUSPEND_SECONDS 5
-
-static unsigned long suspend_test_start_time;
-
-static void suspend_test_start(void)
-{
- /* FIXME Use better timebase than "jiffies", ideally a clocksource.
- * What we want is a hardware counter that will work correctly even
- * during the irqs-are-off stages of the suspend/resume cycle...
- */
- suspend_test_start_time = jiffies;
-}
-
-static void suspend_test_finish(const char *label)
-{
- long nj = jiffies - suspend_test_start_time;
- unsigned msec;
-
- msec = jiffies_to_msecs(abs(nj));
- pr_info("PM: %s took %d.%03d seconds\n", label,
- msec / 1000, msec % 1000);
-
- /* Warning on suspend means the RTC alarm period needs to be
- * larger -- the system was sooo slooowwww to suspend that the
- * alarm (should have) fired before the system went to sleep!
- *
- * Warning on either suspend or resume also means the system
- * has some performance issues. The stack dump of a WARN_ON
- * is more likely to get the right attention than a printk...
- */
- WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
-}
-
-#else
-
-static void suspend_test_start(void)
-{
-}
-
-static void suspend_test_finish(const char *label)
-{
-}
-
-#endif
-
-/* This is just an arbitrary number */
-#define FREE_PAGE_NUMBER (100)
-
-static struct platform_suspend_ops *suspend_ops;
-
-/**
- * suspend_set_ops - Set the global suspend method table.
- * @ops: Pointer to ops structure.
- */
-
-void suspend_set_ops(struct platform_suspend_ops *ops)
-{
- mutex_lock(&pm_mutex);
- suspend_ops = ops;
- mutex_unlock(&pm_mutex);
-}
-
-/**
- * suspend_valid_only_mem - generic memory-only valid callback
- *
- * Platform drivers that implement mem suspend only and only need
- * to check for that in their .valid callback can use this instead
- * of rolling their own .valid callback.
- */
-int suspend_valid_only_mem(suspend_state_t state)
-{
- return state == PM_SUSPEND_MEM;
-}
-
-/**
- * suspend_prepare - Do prep work before entering low-power state.
- *
- * This is common code that is called for each state that we're entering.
- * Run suspend notifiers, allocate a console and stop all processes.
- */
-static int suspend_prepare(void)
-{
- int error;
- unsigned int free_pages;
-
- if (!suspend_ops || !suspend_ops->enter)
- return -EPERM;
-
- pm_prepare_console();
-
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
- goto Finish;
-
- error = usermodehelper_disable();
- if (error)
- goto Finish;
-
- if (suspend_freeze_processes()) {
- error = -EAGAIN;
- goto Thaw;
- }
-
- free_pages = global_page_state(NR_FREE_PAGES);
- if (free_pages < FREE_PAGE_NUMBER) {
- pr_debug("PM: free some memory\n");
- shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
- if (nr_free_pages() < FREE_PAGE_NUMBER) {
- error = -ENOMEM;
- printk(KERN_ERR "PM: No enough memory\n");
- }
- }
- if (!error)
- return 0;
-
- Thaw:
- suspend_thaw_processes();
- usermodehelper_enable();
- Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
- return error;
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
-{
- local_irq_disable();
-}
-
-/* default implementation */
-void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
-{
- local_irq_enable();
-}
-
-/**
- * suspend_enter - enter the desired system sleep state.
- * @state: state to enter
- *
- * This function should be called after devices have been suspended.
- */
-static int suspend_enter(suspend_state_t state)
-{
- int error = 0;
-
- device_pm_lock();
- arch_suspend_disable_irqs();
- BUG_ON(!irqs_disabled());
-
- if ((error = device_power_down(PMSG_SUSPEND))) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
- goto Done;
- }
-
- if (!suspend_test(TEST_CORE))
- error = suspend_ops->enter(state);
-
- device_power_up(PMSG_RESUME);
- Done:
- arch_suspend_enable_irqs();
- BUG_ON(irqs_disabled());
- device_pm_unlock();
- return error;
-}
-
-/**
- * suspend_devices_and_enter - suspend devices and enter the desired system
- * sleep state.
- * @state: state to enter
- */
-int suspend_devices_and_enter(suspend_state_t state)
-{
- int error;
-
- if (!suspend_ops)
- return -ENOSYS;
-
- if (suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- }
- suspend_console();
- suspend_test_start();
- error = device_suspend(PMSG_SUSPEND);
- if (error) {
- printk(KERN_ERR "PM: Some devices failed to suspend\n");
- goto Recover_platform;
- }
- suspend_test_finish("suspend devices");
- if (suspend_test(TEST_DEVICES))
- goto Recover_platform;
-
- if (suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- goto Resume_devices;
- }
-
- if (suspend_test(TEST_PLATFORM))
- goto Finish;
-
- error = disable_nonboot_cpus();
- if (!error && !suspend_test(TEST_CPUS))
- suspend_enter(state);
-
- enable_nonboot_cpus();
- Finish:
- if (suspend_ops->finish)
- suspend_ops->finish();
- Resume_devices:
- suspend_test_start();
- device_resume(PMSG_RESUME);
- suspend_test_finish("resume devices");
- resume_console();
- Close:
- if (suspend_ops->end)
- suspend_ops->end();
- return error;
-
- Recover_platform:
- if (suspend_ops->recover)
- suspend_ops->recover();
- goto Resume_devices;
-}
-
-/**
- * suspend_finish - Do final work before exiting suspend sequence.
- *
- * Call platform code to clean up, restart processes, and free the
- * console that we've allocated. This is not called for suspend-to-disk.
- */
-static void suspend_finish(void)
-{
- suspend_thaw_processes();
- usermodehelper_enable();
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
-}
-
-
-
-
-static const char * const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
-};
-
-static inline int valid_state(suspend_state_t state)
-{
- /* All states need lowlevel support and need to be valid
- * to the lowlevel implementation, no valid callback
- * implies that none are valid. */
- if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
- return 0;
- return 1;
-}
-
-
-/**
- * enter_state - Do common work of entering low-power state.
- * @state: pm_state structure for state we're entering.
- *
- * Make sure we're the only ones trying to enter a sleep state. Fail
- * if someone has beat us to it, since we don't want anything weird to
- * happen when we wake up.
- * Then, do the setup for suspend, enter the state, and cleaup (after
- * we've woken up).
- */
-static int enter_state(suspend_state_t state)
-{
- int error;
-
- if (!valid_state(state))
- return -ENODEV;
-
- if (!mutex_trylock(&pm_mutex))
- return -EBUSY;
-
- printk(KERN_INFO "PM: Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
-
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
- if (error)
- goto Unlock;
-
- if (suspend_test(TEST_FREEZER))
- goto Finish;
-
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
- error = suspend_devices_and_enter(state);
-
- Finish:
- pr_debug("PM: Finishing wakeup.\n");
- suspend_finish();
- Unlock:
- mutex_unlock(&pm_mutex);
- return error;
-}
-
-
-/**
- * pm_suspend - Externally visible function for suspending system.
- * @state: Enumerated value of state to enter.
- *
- * Determine whether or not value is within range, get state
- * structure, and enter (above).
- */
-
-int pm_suspend(suspend_state_t state)
-{
- if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
- return enter_state(state);
- return -EINVAL;
-}
-
-EXPORT_SYMBOL(pm_suspend);
-
-#endif /* CONFIG_SUSPEND */
-
struct kobject *power_kobj;
/**
@@ -478,7 +120,6 @@ struct kobject *power_kobj;
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
-
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -576,7 +217,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-
static int __init pm_init(void)
{
power_kobj = kobject_create_and_add("power", NULL);
@@ -586,144 +226,3 @@ static int __init pm_init(void)
}
core_initcall(pm_init);
-
-
-#ifdef CONFIG_PM_TEST_SUSPEND
-
-#include <linux/rtc.h>
-
-/*
- * To test system suspend, we need a hands-off mechanism to resume the
- * system. RTCs wake alarms are a common self-contained mechanism.
- */
-
-static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
-{
- static char err_readtime[] __initdata =
- KERN_ERR "PM: can't read %s time, err %d\n";
- static char err_wakealarm [] __initdata =
- KERN_ERR "PM: can't set %s wakealarm, err %d\n";
- static char err_suspend[] __initdata =
- KERN_ERR "PM: suspend test failed, error %d\n";
- static char info_test[] __initdata =
- KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
-
- unsigned long now;
- struct rtc_wkalrm alm;
- int status;
-
- /* this may fail if the RTC hasn't been initialized */
- status = rtc_read_time(rtc, &alm.time);
- if (status < 0) {
- printk(err_readtime, dev_name(&rtc->dev), status);
- return;
- }
- rtc_tm_to_time(&alm.time, &now);
-
- memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
- alm.enabled = true;
-
- status = rtc_set_alarm(rtc, &alm);
- if (status < 0) {
- printk(err_wakealarm, dev_name(&rtc->dev), status);
- return;
- }
-
- if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state]);
- status = pm_suspend(state);
- if (status == -ENODEV)
- state = PM_SUSPEND_STANDBY;
- }
- if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state]);
- status = pm_suspend(state);
- }
- if (status < 0)
- printk(err_suspend, status);
-
- /* Some platforms can't detect that the alarm triggered the
- * wakeup, or (accordingly) disable it after it afterwards.
- * It's supposed to give oneshot behavior; cope.
- */
- alm.enabled = false;
- rtc_set_alarm(rtc, &alm);
-}
-
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
-{
- struct rtc_device *candidate = to_rtc_device(dev);
-
- if (!candidate->ops->set_alarm)
- return 0;
- if (!device_may_wakeup(candidate->dev.parent))
- return 0;
-
- *(const char **)name_ptr = dev_name(dev);
- return 1;
-}
-
-/*
- * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
- * at startup time. They're normally disabled, for faster boot and because
- * we can't know which states really work on this particular system.
- */
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
-
-static char warn_bad_state[] __initdata =
- KERN_WARNING "PM: can't test '%s' suspend state\n";
-
-static int __init setup_test_suspend(char *value)
-{
- unsigned i;
-
- /* "=mem" ==> "mem" */
- value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
- printk(warn_bad_state, value);
- return 0;
-}
-__setup("test_suspend", setup_test_suspend);
-
-static int __init test_suspend(void)
-{
- static char warn_no_rtc[] __initdata =
- KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
-
- char *pony = NULL;
- struct rtc_device *rtc = NULL;
-
- /* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
- goto done;
- }
-
- /* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
- if (!rtc) {
- printk(warn_no_rtc);
- goto done;
- }
-
- /* go for it */
- test_wakealarm(rtc, test_state);
- rtc_class_close(rtc);
-done:
- return 0;
-}
-late_initcall(test_suspend);
-
-#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3af..26d5a26f82e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
-extern unsigned int count_data_pages(void);
+extern int swsusp_shrink_memory(void);
/**
* Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
*/
#define SF_PLATFORM_MODE 1
-/* kernel/power/disk.c */
+/* kernel/power/hibernate.c */
extern int swsusp_check(void);
-extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
-/* kernel/power/main.c */
+/* kernel/power/suspend.c */
+extern const char *const pm_states[];
+
+extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
+extern int enter_state(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
+static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
+static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
+#ifdef CONFIG_PM_TEST_SUSPEND
+/* kernel/power/suspend_test.c */
+extern void suspend_test_start(void);
+extern void suspend_test_finish(const char *label);
+#else /* !CONFIG_PM_TEST_SUSPEND */
+static inline void suspend_test_start(void) {}
+static inline void suspend_test_finish(const char *label) {}
+#endif /* !CONFIG_PM_TEST_SUSPEND */
+
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
extern int pm_notifier_call_chain(unsigned long val);
#endif
#ifdef CONFIG_HIGHMEM
-unsigned int count_highmem_pages(void);
int restore_highmem(void);
#else
static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b..e8b33700627 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "powerOff",
.action_msg = "Power Off",
- .enable_mask = SYSRQ_ENABLE_BOOT,
+ .enable_mask = SYSRQ_ENABLE_BOOT,
};
static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497..da2072d7381 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
if (error)
goto Exit;
printk("done.");
+
+ oom_killer_disable();
Exit:
BUG_ON(in_atomic());
printk("\n");
+
return error;
}
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
void thaw_processes(void)
{
+ oom_killer_enable();
+
printk("Restarting tasks ... ");
thaw_tasks(true);
thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f..523a451b45d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
+/*
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N bytes, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned long image_size = 500 * 1024 * 1024;
+
/* List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
@@ -321,13 +329,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
INIT_LIST_HEAD(list);
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
unsigned long zone_start, zone_end;
struct mem_extent *ext, *cur, *aux;
- if (!populated_zone(zone))
- continue;
-
zone_start = zone->zone_start_pfn;
zone_end = zone->zone_start_pfn + zone->spanned_pages;
@@ -804,8 +809,8 @@ static unsigned int count_free_highmem_pages(void)
struct zone *zone;
unsigned int cnt = 0;
- for_each_zone(zone)
- if (populated_zone(zone) && is_highmem(zone))
+ for_each_populated_zone(zone)
+ if (is_highmem(zone))
cnt += zone_page_state(zone, NR_FREE_PAGES);
return cnt;
@@ -843,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
* pages.
*/
-unsigned int count_highmem_pages(void)
+static unsigned int count_highmem_pages(void)
{
struct zone *zone;
unsigned int n = 0;
@@ -905,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
* pages.
*/
-unsigned int count_data_pages(void)
+static unsigned int count_data_pages(void)
{
struct zone *zone;
unsigned long pfn, max_zone_pfn;
@@ -1061,6 +1066,74 @@ void swsusp_free(void)
buffer = NULL;
}
+/**
+ * swsusp_shrink_memory - Try to free as much memory as needed
+ *
+ * ... but do not OOM-kill anyone
+ *
+ * Notice: all userland should be stopped before it is called, or
+ * livelock is possible.
+ */
+
+#define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+ if (tmp > SHRINK_BITE)
+ tmp = SHRINK_BITE;
+ return shrink_all_memory(tmp);
+}
+
+int swsusp_shrink_memory(void)
+{
+ long tmp;
+ struct zone *zone;
+ unsigned long pages = 0;
+ unsigned int i = 0;
+ char *p = "-\\|/";
+ struct timeval start, stop;
+
+ printk(KERN_INFO "PM: Shrinking memory... ");
+ do_gettimeofday(&start);
+ do {
+ long size, highmem_size;
+
+ highmem_size = count_highmem_pages();
+ size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
+ tmp = size;
+ size += highmem_size;
+ for_each_populated_zone(zone) {
+ tmp += snapshot_additional_pages(zone);
+ if (is_highmem(zone)) {
+ highmem_size -=
+ zone_page_state(zone, NR_FREE_PAGES);
+ } else {
+ tmp -= zone_page_state(zone, NR_FREE_PAGES);
+ tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ }
+ }
+
+ if (highmem_size < 0)
+ highmem_size = 0;
+
+ tmp += highmem_size;
+ if (tmp > 0) {
+ tmp = __shrink_memory(tmp);
+ if (!tmp)
+ return -ENOMEM;
+ pages += tmp;
+ } else if (size > image_size / PAGE_SIZE) {
+ tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
+ pages += tmp;
+ }
+ printk("\b%c", p[i++%4]);
+ } while (tmp > 0);
+ do_gettimeofday(&stop);
+ printk("\bdone (%lu pages freed)\n", pages);
+ swsusp_show_speed(&start, &stop, pages, "Freed");
+
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
/**
* count_pages_for_highmem - compute the number of non-highmem pages
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 00000000000..6f10dfc2d3e
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+
+#include "power.h"
+
+const char *const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
+
+static struct platform_suspend_ops *suspend_ops;
+
+/**
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Pointer to ops structure.
+ */
+void suspend_set_ops(struct platform_suspend_ops *ops)
+{
+ mutex_lock(&pm_mutex);
+ suspend_ops = ops;
+ mutex_unlock(&pm_mutex);
+}
+
+bool valid_state(suspend_state_t state)
+{
+ /*
+ * All states need lowlevel support and need to be valid to the lowlevel
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/**
+ * suspend_valid_only_mem - generic memory-only valid callback
+ *
+ * Platform drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+#endif /* !CONFIG_PM_DEBUG */
+ return 0;
+}
+
+/**
+ * suspend_prepare - Do prep work before entering low-power state.
+ *
+ * This is common code that is called for each state that we're entering.
+ * Run suspend notifiers, allocate a console and stop all processes.
+ */
+static int suspend_prepare(void)
+{
+ int error;
+
+ if (!suspend_ops || !suspend_ops->enter)
+ return -EPERM;
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (error)
+ goto Finish;
+
+ error = usermodehelper_disable();
+ if (error)
+ goto Finish;
+
+ error = suspend_freeze_processes();
+ if (!error)
+ return 0;
+
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ Finish:
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+ return error;
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
+
+/**
+ * suspend_enter - enter the desired system sleep state.
+ * @state: state to enter
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state)
+{
+ int error;
+
+ if (suspend_ops->prepare) {
+ error = suspend_ops->prepare();
+ if (error)
+ return error;
+ }
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down\n");
+ goto Platfrom_finish;
+ }
+
+ if (suspend_ops->prepare_late) {
+ error = suspend_ops->prepare_late();
+ if (error)
+ goto Power_up_devices;
+ }
+
+ if (suspend_test(TEST_PLATFORM))
+ goto Platform_wake;
+
+ error = disable_nonboot_cpus();
+ if (error || suspend_test(TEST_CPUS))
+ goto Enable_cpus;
+
+ arch_suspend_disable_irqs();
+ BUG_ON(!irqs_disabled());
+
+ error = sysdev_suspend(PMSG_SUSPEND);
+ if (!error) {
+ if (!suspend_test(TEST_CORE))
+ error = suspend_ops->enter(state);
+ sysdev_resume();
+ }
+
+ arch_suspend_enable_irqs();
+ BUG_ON(irqs_disabled());
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Platform_wake:
+ if (suspend_ops->wake)
+ suspend_ops->wake();
+
+ Power_up_devices:
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platfrom_finish:
+ if (suspend_ops->finish)
+ suspend_ops->finish();
+
+ return error;
+}
+
+/**
+ * suspend_devices_and_enter - suspend devices and enter the desired system
+ * sleep state.
+ * @state: state to enter
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+ int error;
+
+ if (!suspend_ops)
+ return -ENOSYS;
+
+ if (suspend_ops->begin) {
+ error = suspend_ops->begin(state);
+ if (error)
+ goto Close;
+ }
+ suspend_console();
+ suspend_test_start();
+ error = dpm_suspend_start(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to suspend\n");
+ goto Recover_platform;
+ }
+ suspend_test_finish("suspend devices");
+ if (suspend_test(TEST_DEVICES))
+ goto Recover_platform;
+
+ suspend_enter(state);
+
+ Resume_devices:
+ suspend_test_start();
+ dpm_resume_end(PMSG_RESUME);
+ suspend_test_finish("resume devices");
+ resume_console();
+ Close:
+ if (suspend_ops->end)
+ suspend_ops->end();
+ return error;
+
+ Recover_platform:
+ if (suspend_ops->recover)
+ suspend_ops->recover();
+ goto Resume_devices;
+}
+
+/**
+ * suspend_finish - Do final work before exiting suspend sequence.
+ *
+ * Call platform code to clean up, restart processes, and free the
+ * console that we've allocated. This is not called for suspend-to-disk.
+ */
+static void suspend_finish(void)
+{
+ suspend_thaw_processes();
+ usermodehelper_enable();
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+}
+
+/**
+ * enter_state - Do common work of entering low-power state.
+ * @state: pm_state structure for state we're entering.
+ *
+ * Make sure we're the only ones trying to enter a sleep state. Fail
+ * if someone has beat us to it, since we don't want anything weird to
+ * happen when we wake up.
+ * Then, do the setup for suspend, enter the state, and cleaup (after
+ * we've woken up).
+ */
+int enter_state(suspend_state_t state)
+{
+ int error;
+
+ if (!valid_state(state))
+ return -ENODEV;
+
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ error = suspend_prepare();
+ if (error)
+ goto Unlock;
+
+ if (suspend_test(TEST_FREEZER))
+ goto Finish;
+
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ error = suspend_devices_and_enter(state);
+
+ Finish:
+ pr_debug("PM: Finishing wakeup.\n");
+ suspend_finish();
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+/**
+ * pm_suspend - Externally visible function for suspending system.
+ * @state: Enumerated value of state to enter.
+ *
+ * Determine whether or not value is within range, get state
+ * structure, and enter (above).
+ */
+int pm_suspend(suspend_state_t state)
+{
+ if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
+ return enter_state(state);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 00000000000..17d8bb1acf9
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/rtc.h>
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending. Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS 5
+
+static unsigned long suspend_test_start_time;
+
+void suspend_test_start(void)
+{
+ /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+ * What we want is a hardware counter that will work correctly even
+ * during the irqs-are-off stages of the suspend/resume cycle...
+ */
+ suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+ long nj = jiffies - suspend_test_start_time;
+ unsigned msec;
+
+ msec = jiffies_to_msecs(abs(nj));
+ pr_info("PM: %s took %d.%03d seconds\n", label,
+ msec / 1000, msec % 1000);
+
+ /* Warning on suspend means the RTC alarm period needs to be
+ * larger -- the system was sooo slooowwww to suspend that the
+ * alarm (should have) fired before the system went to sleep!
+ *
+ * Warning on either suspend or resume also means the system
+ * has some performance issues. The stack dump of a WARN_ON
+ * is more likely to get the right attention than a printk...
+ */
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+}
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system. RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+ static char err_readtime[] __initdata =
+ KERN_ERR "PM: can't read %s time, err %d\n";
+ static char err_wakealarm [] __initdata =
+ KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+ static char err_suspend[] __initdata =
+ KERN_ERR "PM: suspend test failed, error %d\n";
+ static char info_test[] __initdata =
+ KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+ unsigned long now;
+ struct rtc_wkalrm alm;
+ int status;
+
+ /* this may fail if the RTC hasn't been initialized */
+ status = rtc_read_time(rtc, &alm.time);
+ if (status < 0) {
+ printk(err_readtime, dev_name(&rtc->dev), status);
+ return;
+ }
+ rtc_tm_to_time(&alm.time, &now);
+
+ memset(&alm, 0, sizeof alm);
+ rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ alm.enabled = true;
+
+ status = rtc_set_alarm(rtc, &alm);
+ if (status < 0) {
+ printk(err_wakealarm, dev_name(&rtc->dev), status);
+ return;
+ }
+
+ if (state == PM_SUSPEND_MEM) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ if (status == -ENODEV)
+ state = PM_SUSPEND_STANDBY;
+ }
+ if (state == PM_SUSPEND_STANDBY) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+ if (status < 0)
+ printk(err_suspend, status);
+
+ /* Some platforms can't detect that the alarm triggered the
+ * wakeup, or (accordingly) disable it after it afterwards.
+ * It's supposed to give oneshot behavior; cope.
+ */
+ alm.enabled = false;
+ rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(const char **)name_ptr = dev_name(dev);
+ return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time. They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+ KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+ unsigned i;
+
+ /* "=mem" ==> "mem" */
+ value++;
+ for (i = 0; i < PM_SUSPEND_MAX; i++) {
+ if (!pm_states[i])
+ continue;
+ if (strcmp(pm_states[i], value) != 0)
+ continue;
+ test_state = (__force suspend_state_t) i;
+ return 0;
+ }
+ printk(warn_bad_state, value);
+ return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+ static char warn_no_rtc[] __initdata =
+ KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+ char *pony = NULL;
+ struct rtc_device *rtc = NULL;
+
+ /* PM is initialized by now; is that state testable? */
+ if (test_state == PM_SUSPEND_ON)
+ goto done;
+ if (!valid_state(test_state)) {
+ printk(warn_bad_state, pm_states[test_state]);
+ goto done;
+ }
+
+ /* RTCs have initialized by now too ... can we use one? */
+ class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+ if (pony)
+ rtc = rtc_class_open(pony);
+ if (!rtc) {
+ printk(warn_no_rtc);
+ goto done;
+ }
+
+ /* go for it */
+ test_wakealarm(rtc, test_state);
+ rtc_class_close(rtc);
+done:
+ return 0;
+}
+late_initcall(test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6da14358537..8ba052c86d4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -60,11 +60,10 @@ static struct block_device *resume_bdev;
static int submit(int rw, pgoff_t page_off, struct page *page,
struct bio **bio_chain)
{
+ const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
- if (!bio)
- return -ENOMEM;
bio->bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = resume_bdev;
bio->bi_end_io = end_swap_bio_read;
@@ -80,7 +79,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
bio_get(bio);
if (bio_chain == NULL) {
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ submit_bio(bio_rw, bio);
wait_on_page_locked(page);
if (rw == READ)
bio_set_pages_dirty(bio);
@@ -90,7 +89,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
get_page(page); /* These pages are freed later */
bio->bi_private = *bio_chain;
*bio_chain = bio;
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ submit_bio(bio_rw, bio);
}
return 0;
}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c9145155..6a07f4dbf2f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,17 +51,10 @@
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/rbtree.h>
+#include <linux/io.h>
#include "power.h"
-/*
- * Preferred image size in bytes (tunable via /sys/power/image_size).
- * When it is set to N, swsusp will do its best to ensure the image
- * size will not exceed N bytes, but if that is impossible, it will
- * try to create the smallest image possible.
- */
-unsigned long image_size = 500 * 1024 * 1024;
-
int in_suspend __nosavedata = 0;
/**
@@ -193,194 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
}
-
-/**
- * swsusp_shrink_memory - Try to free as much memory as needed
- *
- * ... but do not OOM-kill anyone
- *
- * Notice: all userland should be stopped before it is called, or
- * livelock is possible.
- */
-
-#define SHRINK_BITE 10000
-static inline unsigned long __shrink_memory(long tmp)
-{
- if (tmp > SHRINK_BITE)
- tmp = SHRINK_BITE;
- return shrink_all_memory(tmp);
-}
-
-int swsusp_shrink_memory(void)
-{
- long tmp;
- struct zone *zone;
- unsigned long pages = 0;
- unsigned int i = 0;
- char *p = "-\\|/";
- struct timeval start, stop;
-
- printk(KERN_INFO "PM: Shrinking memory... ");
- do_gettimeofday(&start);
- do {
- long size, highmem_size;
-
- highmem_size = count_highmem_pages();
- size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
- tmp = size;
- size += highmem_size;
- for_each_zone (zone)
- if (populated_zone(zone)) {
- tmp += snapshot_additional_pages(zone);
- if (is_highmem(zone)) {
- highmem_size -=
- zone_page_state(zone, NR_FREE_PAGES);
- } else {
- tmp -= zone_page_state(zone, NR_FREE_PAGES);
- tmp += zone->lowmem_reserve[ZONE_NORMAL];
- }
- }
-
- if (highmem_size < 0)
- highmem_size = 0;
-
- tmp += highmem_size;
- if (tmp > 0) {
- tmp = __shrink_memory(tmp);
- if (!tmp)
- return -ENOMEM;
- pages += tmp;
- } else if (size > image_size / PAGE_SIZE) {
- tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
- pages += tmp;
- }
- printk("\b%c", p[i++%4]);
- } while (tmp > 0);
- do_gettimeofday(&stop);
- printk("\bdone (%lu pages freed)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Freed");
-
- return 0;
-}
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * hibernation and to restore the contents of this memory during the subsequent
- * resume. The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
- unsigned long phys_start;
- unsigned int size;
- void *kaddr;
- void *data;
- struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- * hibernate_nvs_register - register platform NVS memory region to save
- * @start - physical address of the region
- * @size - size of the region
- *
- * The NVS region need not be page-aligned (both ends) and we arrange
- * things so that the data from page-aligned addresses in this region will
- * be copied into separate RAM pages.
- */
-int hibernate_nvs_register(unsigned long start, unsigned long size)
-{
- struct nvs_page *entry, *next;
-
- while (size > 0) {
- unsigned int nr_bytes;
-
- entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
- if (!entry)
- goto Error;
-
- list_add_tail(&entry->node, &nvs_list);
- entry->phys_start = start;
- nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
- entry->size = (size < nr_bytes) ? size : nr_bytes;
-
- start += entry->size;
- size -= entry->size;
- }
- return 0;
-
- Error:
- list_for_each_entry_safe(entry, next, &nvs_list, node) {
- list_del(&entry->node);
- kfree(entry);
- }
- return -ENOMEM;
-}
-
-/**
- * hibernate_nvs_free - free data pages allocated for saving NVS regions
- */
-void hibernate_nvs_free(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- free_page((unsigned long)entry->data);
- entry->data = NULL;
- if (entry->kaddr) {
- iounmap(entry->kaddr);
- entry->kaddr = NULL;
- }
- }
-}
-
-/**
- * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int hibernate_nvs_alloc(void)
-{
- struct nvs_page *entry;
-
- list_for_each_entry(entry, &nvs_list, node) {
- entry->data = (void *)__get_free_page(GFP_KERNEL);
- if (!entry->data) {
- hibernate_nvs_free();
- return -ENOMEM;
- }
- }
- return 0;
-}
-
-/**
- * hibernate_nvs_save - save NVS memory regions
- */
-void hibernate_nvs_save(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data) {
- entry->kaddr = ioremap(entry->phys_start, entry->size);
- memcpy(entry->data, entry->kaddr, entry->size);
- }
-}
-
-/**
- * hibernate_nvs_restore - restore NVS memory regions
- *
- * This function is going to be called with interrupts disabled, so it
- * cannot iounmap the virtual addresses used to access the NVS region.
- */
-void hibernate_nvs_restore(void)
-{
- struct nvs_page *entry;
-
- printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
- list_for_each_entry(entry, &nvs_list, node)
- if (entry->data)
- memcpy(entry->kaddr, entry->data, entry->size);
-}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 005b93d839b..bf0014d6a5f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
@@ -92,18 +92,26 @@ static int snapshot_open(struct inode *inode, struct file *filp)
filp->private_data = data;
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ /* Hibernating. The image device should be accessible. */
data->swap = swsusp_resume_device ?
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
} else {
+ /*
+ * Resuming. We may need to wait for the image device to
+ * appear.
+ */
+ wait_for_device_probe();
+ scsi_complete_async_scans();
+
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ pm_notifier_call_chain(PM_POST_RESTORE);
}
if (error)
atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk.c b/kernel/printk.c
index 69188f226a9..e10d193a833 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,10 +32,17 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
+#include <linux/kexec.h>
#include <asm/uaccess.h>
/*
+ * for_each_console() allows you to iterate on each console
+ */
+#define for_each_console(con) \
+ for (con = console_drivers; con != NULL; con = con->next)
+
+/*
* Architectures can override it:
*/
void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -60,6 +67,8 @@ int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
+static int saved_console_loglevel = -1;
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -73,7 +82,6 @@ EXPORT_SYMBOL(oops_in_progress);
* driver system.
*/
static DECLARE_MUTEX(console_sem);
-static DECLARE_MUTEX(secondary_console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
@@ -136,6 +144,24 @@ static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcoreinfo
+ *
+ * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate. These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+ VMCOREINFO_SYMBOL(log_buf);
+ VMCOREINFO_SYMBOL(log_end);
+ VMCOREINFO_SYMBOL(log_buf_len);
+ VMCOREINFO_SYMBOL(logged_chars);
+}
+#endif
+
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
@@ -354,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
logged_chars = 0;
break;
case 6: /* Disable logging to console */
+ if (saved_console_loglevel == -1)
+ saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
case 7: /* Enable logging to console */
- console_loglevel = default_console_loglevel;
+ if (saved_console_loglevel != -1) {
+ console_loglevel = saved_console_loglevel;
+ saved_console_loglevel = -1;
+ }
break;
case 8: /* Set level of messages printed to console */
error = -EINVAL;
@@ -366,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
+ /* Implicitly re-enable logging to console */
+ saved_console_loglevel = -1;
error = 0;
break;
case 9: /* Number of chars in the log buffer */
@@ -394,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
{
struct console *con;
- for (con = console_drivers; con; con = con->next) {
+ for_each_console(con) {
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
(con->flags & CON_ANYTIME)))
@@ -526,7 +559,7 @@ static int have_callable_console(void)
{
struct console *con;
- for (con = console_drivers; con; con = con->next)
+ for_each_console(con)
if (con->flags & CON_ANYTIME)
return 1;
@@ -669,20 +702,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
sizeof(printk_buf) - printed_len, fmt, args);
+ p = printk_buf;
+
+ /* Do we have a loglevel in the string? */
+ if (p[0] == '<') {
+ unsigned char c = p[1];
+ if (c && p[2] == '>') {
+ switch (c) {
+ case '0' ... '7': /* loglevel */
+ current_log_level = c - '0';
+ /* Fallthrough - make sure we're on a new line */
+ case 'd': /* KERN_DEFAULT */
+ if (!new_text_line) {
+ emit_log_char('\n');
+ new_text_line = 1;
+ }
+ /* Fallthrough - skip the loglevel */
+ case 'c': /* KERN_CONT */
+ p += 3;
+ break;
+ }
+ }
+ }
+
/*
* Copy the output into log_buf. If the caller didn't provide
* appropriate log level tags, we insert them here
*/
- for (p = printk_buf; *p; p++) {
+ for ( ; *p; p++) {
if (new_text_line) {
- /* If a token, set current_log_level and skip over */
- if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
- p[2] == '>') {
- current_log_level = p[1] - '0';
- p += 3;
- printed_len -= 3;
- }
-
/* Always output the token */
emit_log_char('<');
emit_log_char(current_log_level + '0');
@@ -891,12 +939,14 @@ void suspend_console(void)
printk("Suspending console(s) (use no_console_suspend to debug)\n");
acquire_console_sem();
console_suspended = 1;
+ up(&console_sem);
}
void resume_console(void)
{
if (!console_suspend_enabled)
return;
+ down(&console_sem);
console_suspended = 0;
release_console_sem();
}
@@ -912,11 +962,9 @@ void resume_console(void)
void acquire_console_sem(void)
{
BUG_ON(in_interrupt());
- if (console_suspended) {
- down(&secondary_console_sem);
- return;
- }
down(&console_sem);
+ if (console_suspended)
+ return;
console_locked = 1;
console_may_schedule = 1;
}
@@ -926,6 +974,10 @@ int try_acquire_console_sem(void)
{
if (down_trylock(&console_sem))
return -1;
+ if (console_suspended) {
+ up(&console_sem);
+ return -1;
+ }
console_locked = 1;
console_may_schedule = 0;
return 0;
@@ -979,7 +1031,7 @@ void release_console_sem(void)
unsigned wake_klogd = 0;
if (console_suspended) {
- up(&secondary_console_sem);
+ up(&console_sem);
return;
}
@@ -1045,7 +1097,7 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for (c = console_drivers; c != NULL; c = c->next)
+ for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
release_console_sem();
@@ -1060,7 +1112,7 @@ struct tty_driver *console_device(int *index)
struct tty_driver *driver = NULL;
acquire_console_sem();
- for (c = console_drivers; c != NULL; c = c->next) {
+ for_each_console(c) {
if (!c->device)
continue;
driver = c->device(c, index);
@@ -1097,25 +1149,49 @@ EXPORT_SYMBOL(console_start);
* to register the console printing procedure with printk() and to
* print any messages that were printed by the kernel before the
* console driver was initialized.
+ *
+ * This can happen pretty early during the boot process (because of
+ * early_printk) - sometimes before setup_arch() completes - be careful
+ * of what kernel features are used - they may not be initialised yet.
+ *
+ * There are two types of consoles - bootconsoles (early_printk) and
+ * "real" consoles (everything which is not a bootconsole) which are
+ * handled differently.
+ * - Any number of bootconsoles can be registered at any time.
+ * - As soon as a "real" console is registered, all bootconsoles
+ * will be unregistered automatically.
+ * - Once a "real" console is registered, any attempt to register a
+ * bootconsoles will be rejected
*/
-void register_console(struct console *console)
+void register_console(struct console *newcon)
{
int i;
unsigned long flags;
- struct console *bootconsole = NULL;
+ struct console *bcon = NULL;
- if (console_drivers) {
- if (console->flags & CON_BOOT)
- return;
- if (console_drivers->flags & CON_BOOT)
- bootconsole = console_drivers;
+ /*
+ * before we register a new CON_BOOT console, make sure we don't
+ * already have a valid console
+ */
+ if (console_drivers && newcon->flags & CON_BOOT) {
+ /* find the last or real console */
+ for_each_console(bcon) {
+ if (!(bcon->flags & CON_BOOT)) {
+ printk(KERN_INFO "Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
+ }
}
- if (preferred_console < 0 || bootconsole || !console_drivers)
+ if (console_drivers && console_drivers->flags & CON_BOOT)
+ bcon = console_drivers;
+
+ if (preferred_console < 0 || bcon || !console_drivers)
preferred_console = selected_console;
- if (console->early_setup)
- console->early_setup();
+ if (newcon->early_setup)
+ newcon->early_setup();
/*
* See if we want to use this console driver. If we
@@ -1123,13 +1199,13 @@ void register_console(struct console *console)
* that registers here.
*/
if (preferred_console < 0) {
- if (console->index < 0)
- console->index = 0;
- if (console->setup == NULL ||
- console->setup(console, NULL) == 0) {
- console->flags |= CON_ENABLED;
- if (console->device) {
- console->flags |= CON_CONSDEV;
+ if (newcon->index < 0)
+ newcon->index = 0;
+ if (newcon->setup == NULL ||
+ newcon->setup(newcon, NULL) == 0) {
+ newcon->flags |= CON_ENABLED;
+ if (newcon->device) {
+ newcon->flags |= CON_CONSDEV;
preferred_console = 0;
}
}
@@ -1141,64 +1217,62 @@ void register_console(struct console *console)
*/
for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
i++) {
- if (strcmp(console_cmdline[i].name, console->name) != 0)
+ if (strcmp(console_cmdline[i].name, newcon->name) != 0)
continue;
- if (console->index >= 0 &&
- console->index != console_cmdline[i].index)
+ if (newcon->index >= 0 &&
+ newcon->index != console_cmdline[i].index)
continue;
- if (console->index < 0)
- console->index = console_cmdline[i].index;
+ if (newcon->index < 0)
+ newcon->index = console_cmdline[i].index;
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
if (console_cmdline[i].brl_options) {
- console->flags |= CON_BRL;
- braille_register_console(console,
+ newcon->flags |= CON_BRL;
+ braille_register_console(newcon,
console_cmdline[i].index,
console_cmdline[i].options,
console_cmdline[i].brl_options);
return;
}
#endif
- if (console->setup &&
- console->setup(console, console_cmdline[i].options) != 0)
+ if (newcon->setup &&
+ newcon->setup(newcon, console_cmdline[i].options) != 0)
break;
- console->flags |= CON_ENABLED;
- console->index = console_cmdline[i].index;
+ newcon->flags |= CON_ENABLED;
+ newcon->index = console_cmdline[i].index;
if (i == selected_console) {
- console->flags |= CON_CONSDEV;
+ newcon->flags |= CON_CONSDEV;
preferred_console = selected_console;
}
break;
}
- if (!(console->flags & CON_ENABLED))
+ if (!(newcon->flags & CON_ENABLED))
return;
- if (bootconsole && (console->flags & CON_CONSDEV)) {
- printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
- bootconsole->name, bootconsole->index,
- console->name, console->index);
- unregister_console(bootconsole);
- console->flags &= ~CON_PRINTBUFFER;
- } else {
- printk(KERN_INFO "console [%s%d] enabled\n",
- console->name, console->index);
- }
+ /*
+ * If we have a bootconsole, and are switching to a real console,
+ * don't print everything out again, since when the boot console, and
+ * the real console are the same physical device, it's annoying to
+ * see the beginning boot messages twice
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ newcon->flags &= ~CON_PRINTBUFFER;
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
acquire_console_sem();
- if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
- console->next = console_drivers;
- console_drivers = console;
- if (console->next)
- console->next->flags &= ~CON_CONSDEV;
+ if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
+ newcon->next = console_drivers;
+ console_drivers = newcon;
+ if (newcon->next)
+ newcon->next->flags &= ~CON_CONSDEV;
} else {
- console->next = console_drivers->next;
- console_drivers->next = console;
+ newcon->next = console_drivers->next;
+ console_drivers->next = newcon;
}
- if (console->flags & CON_PRINTBUFFER) {
+ if (newcon->flags & CON_PRINTBUFFER) {
/*
* release_console_sem() will print out the buffered messages
* for us.
@@ -1208,6 +1282,28 @@ void register_console(struct console *console)
spin_unlock_irqrestore(&logbuf_lock, flags);
}
release_console_sem();
+
+ /*
+ * By unregistering the bootconsoles after we enable the real console
+ * we get the "console xxx enabled" message on all the consoles -
+ * boot consoles, real consoles, etc - this is to ensure that end
+ * users know there might be something in the kernel's log buffer that
+ * went to the bootconsole (that they do not see on the real console)
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+ /* we need to iterate through twice, to make sure we print
+ * everything out, before we unregister the console(s)
+ */
+ printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
+ newcon->name, newcon->index);
+ for_each_console(bcon)
+ if (bcon->flags & CON_BOOT)
+ unregister_console(bcon);
+ } else {
+ printk(KERN_INFO "%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
+ }
}
EXPORT_SYMBOL(register_console);
@@ -1250,11 +1346,13 @@ EXPORT_SYMBOL(unregister_console);
static int __init disable_boot_consoles(void)
{
- if (console_drivers != NULL) {
- if (console_drivers->flags & CON_BOOT) {
+ struct console *con;
+
+ for_each_console(con) {
+ if (con->flags & CON_BOOT) {
printk(KERN_INFO "turn off boot console %s%d\n",
- console_drivers->name, console_drivers->index);
- return unregister_console(console_drivers);
+ con->name, con->index);
+ unregister_console(con);
}
}
return 0;
@@ -1289,8 +1387,11 @@ EXPORT_SYMBOL(printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
- if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
- *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+ if (*caller_jiffies == 0
+ || !time_in_range(jiffies, *caller_jiffies,
+ *caller_jiffies
+ + msecs_to_jiffies(interval_msecs))) {
+ *caller_jiffies = jiffies;
return true;
}
return false;
diff --git a/kernel/profile.c b/kernel/profile.c
index 784933acf5b..419250ebec4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,20 +111,18 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!slab_is_available()) {
- prof_buffer = alloc_bootmem(buffer_bytes);
- alloc_bootmem_cpumask_var(&prof_cpu_mask);
- return 0;
- }
if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
return -ENOMEM;
- prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
+ cpumask_copy(prof_cpu_mask, cpu_possible_mask);
+
+ prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
if (prof_buffer)
return 0;
- prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
+ prof_buffer = alloc_pages_exact(buffer_bytes,
+ GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
if (prof_buffer)
return 0;
@@ -368,7 +366,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -376,7 +374,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -567,14 +565,14 @@ static int create_hash_tables(void)
int node = cpu_to_node(cpu);
struct page *page;
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_node(node,
+ page = alloc_pages_exact_node(node,
GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
0);
if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f0..307c285af59 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,22 +21,10 @@
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
- arch_ptrace_fork(child, clone_flags);
-}
-
-/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
@@ -48,7 +36,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
}
-
+
/*
* Turn a tracing stop into a normal stop now, since with no tracer there
* would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -60,11 +48,15 @@ static void ptrace_untrace(struct task_struct *child)
{
spin_lock(&child->sighand->siglock);
if (task_is_traced(child)) {
- if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+ /*
+ * If the group stop is completed or in progress,
+ * this thread was already counted as stopped.
+ */
+ if (child->signal->flags & SIGNAL_STOP_STOPPED ||
+ child->signal->group_stop_count)
__set_task_state(child, TASK_STOPPED);
- } else {
+ else
signal_wake_up(child, 1);
- }
}
spin_unlock(&child->sighand->siglock);
}
@@ -160,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
if (!dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
- return security_ptrace_may_access(task, mode);
+ return security_ptrace_access_check(task, mode);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -169,84 +161,140 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
task_lock(task);
err = __ptrace_may_access(task, mode);
task_unlock(task);
- return (!err ? true : false);
+ return !err;
}
int ptrace_attach(struct task_struct *task)
{
int retval;
- unsigned long flags;
audit_ptrace(task);
retval = -EPERM;
+ if (unlikely(task->flags & PF_KTHREAD))
+ goto out;
if (same_thread_group(task, current))
goto out;
- /* Protect exec's credential calculations against our interference;
- * SUID, SGID and LSM creds get determined differently under ptrace.
+ /*
+ * Protect exec's credential calculations against our interference;
+ * interference; SUID, SGID and LSM creds get determined differently
+ * under ptrace.
*/
- retval = mutex_lock_interruptible(&current->cred_exec_mutex);
- if (retval < 0)
+ retval = -ERESTARTNOINTR;
+ if (mutex_lock_interruptible(&task->cred_guard_mutex))
goto out;
- retval = -EPERM;
-repeat:
- /*
- * Nasty, nasty.
- *
- * We want to hold both the task-lock and the
- * tasklist_lock for writing at the same time.
- * But that's against the rules (tasklist_lock
- * is taken for reading by interrupts on other
- * cpu's that may have task_lock).
- */
task_lock(task);
- if (!write_trylock_irqsave(&tasklist_lock, flags)) {
- task_unlock(task);
- do {
- cpu_relax();
- } while (!write_can_lock(&tasklist_lock));
- goto repeat;
- }
-
- if (!task->mm)
- goto bad;
- /* the same process cannot be attached many times */
- if (task->ptrace & PT_PTRACED)
- goto bad;
retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ task_unlock(task);
if (retval)
- goto bad;
+ goto unlock_creds;
- /* Go */
- task->ptrace |= PT_PTRACED;
+ write_lock_irq(&tasklist_lock);
+ retval = -EPERM;
+ if (unlikely(task->exit_state))
+ goto unlock_tasklist;
+ if (task->ptrace)
+ goto unlock_tasklist;
+
+ task->ptrace = PT_PTRACED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
-
send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
-bad:
- write_unlock_irqrestore(&tasklist_lock, flags);
- task_unlock(task);
- mutex_unlock(&current->cred_exec_mutex);
+
+ retval = 0;
+unlock_tasklist:
+ write_unlock_irq(&tasklist_lock);
+unlock_creds:
+ mutex_unlock(&task->cred_guard_mutex);
out:
return retval;
}
-static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
+/**
+ * ptrace_traceme -- helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+int ptrace_traceme(void)
{
- child->exit_code = data;
- /* .. re-parent .. */
- __ptrace_unlink(child);
- /* .. and wake it up. */
- if (child->exit_state != EXIT_ZOMBIE)
- wake_up_process(child);
+ int ret = -EPERM;
+
+ write_lock_irq(&tasklist_lock);
+ /* Are we already being traced? */
+ if (!current->ptrace) {
+ ret = security_ptrace_traceme(current->parent);
+ /*
+ * Check PF_EXITING to ensure ->real_parent has not passed
+ * exit_ptrace(). Otherwise we don't report the error but
+ * pretend ->real_parent untraces us right after return.
+ */
+ if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+ current->ptrace = PT_PTRACED;
+ __ptrace_link(current, current->real_parent);
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * Called with irqs disabled, returns true if childs should reap themselves.
+ */
+static int ignoring_children(struct sighand_struct *sigh)
+{
+ int ret;
+ spin_lock(&sigh->siglock);
+ ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+ (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+ spin_unlock(&sigh->siglock);
+ return ret;
+}
+
+/*
+ * Called with tasklist_lock held for writing.
+ * Unlink a traced task, and clean it up if it was a traced zombie.
+ * Return true if it needs to be reaped with release_task().
+ * (We can't call release_task() here because we already hold tasklist_lock.)
+ *
+ * If it's a zombie, our attachedness prevented normal parent notification
+ * or self-reaping. Do notification now if it would have happened earlier.
+ * If it should reap itself, return true.
+ *
+ * If it's our own child, there is no notification to do.
+ * But if our normal children self-reap, then this child
+ * was prevented by ptrace and we must reap it now.
+ */
+static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+ __ptrace_unlink(p);
+
+ if (p->exit_state == EXIT_ZOMBIE) {
+ if (!task_detached(p) && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, tracer))
+ do_notify_parent(p, p->exit_signal);
+ else if (ignoring_children(tracer->sighand))
+ p->exit_signal = -1;
+ }
+ if (task_detached(p)) {
+ /* Mark it as in the process of being reaped. */
+ p->exit_state = EXIT_DEAD;
+ return true;
+ }
+ }
+
+ return false;
}
int ptrace_detach(struct task_struct *child, unsigned int data)
{
+ bool dead = false;
+
if (!valid_signal(data))
return -EIO;
@@ -255,14 +303,47 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
- /* protect against de_thread()->release_task() */
- if (child->ptrace)
- __ptrace_detach(child, data);
+ /*
+ * This child can be already killed. Make sure de_thread() or
+ * our sub-thread doing do_wait() didn't do release_task() yet.
+ */
+ if (child->ptrace) {
+ child->exit_code = data;
+ dead = __ptrace_detach(current, child);
+ if (!child->exit_state)
+ wake_up_process(child);
+ }
write_unlock_irq(&tasklist_lock);
+ if (unlikely(dead))
+ release_task(child);
+
return 0;
}
+/*
+ * Detach all tasks we were using ptrace on.
+ */
+void exit_ptrace(struct task_struct *tracer)
+{
+ struct task_struct *p, *n;
+ LIST_HEAD(ptrace_dead);
+
+ write_lock_irq(&tasklist_lock);
+ list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+ if (__ptrace_detach(tracer, p))
+ list_add(&p->ptrace_entry, &ptrace_dead);
+ }
+ write_unlock_irq(&tasklist_lock);
+
+ BUG_ON(!list_empty(&tracer->ptraced));
+
+ list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+}
+
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
@@ -283,7 +364,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
copied += retval;
src += retval;
dst += retval;
- len -= retval;
+ len -= retval;
}
return copied;
}
@@ -308,7 +389,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
copied += retval;
src += retval;
dst += retval;
- len -= retval;
+ len -= retval;
}
return copied;
}
@@ -343,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
{
+ unsigned long flags;
int error = -ESRCH;
- read_lock(&tasklist_lock);
- if (likely(child->sighand != NULL)) {
+ if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
- spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*info = *child->last_siginfo;
error = 0;
}
- spin_unlock_irq(&child->sighand->siglock);
+ unlock_task_sighand(child, &flags);
}
- read_unlock(&tasklist_lock);
return error;
}
static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
{
+ unsigned long flags;
int error = -ESRCH;
- read_lock(&tasklist_lock);
- if (likely(child->sighand != NULL)) {
+ if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
- spin_lock_irq(&child->sighand->siglock);
if (likely(child->last_siginfo != NULL)) {
*child->last_siginfo = *info;
error = 0;
}
- spin_unlock_irq(&child->sighand->siglock);
+ unlock_task_sighand(child, &flags);
}
- read_unlock(&tasklist_lock);
return error;
}
@@ -421,9 +498,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
if (unlikely(!arch_has_single_step()))
return -EIO;
user_enable_single_step(child);
- }
- else
+ } else {
user_disable_single_step(child);
+ }
child->exit_code = data;
wake_up_process(child);
@@ -500,71 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-/**
- * ptrace_traceme -- helper for PTRACE_TRACEME
- *
- * Performs checks and sets PT_PTRACED.
- * Should be used by all ptrace implementations for PTRACE_TRACEME.
- */
-int ptrace_traceme(void)
-{
- int ret = -EPERM;
-
- /*
- * Are we already being traced?
- */
-repeat:
- task_lock(current);
- if (!(current->ptrace & PT_PTRACED)) {
- /*
- * See ptrace_attach() comments about the locking here.
- */
- unsigned long flags;
- if (!write_trylock_irqsave(&tasklist_lock, flags)) {
- task_unlock(current);
- do {
- cpu_relax();
- } while (!write_can_lock(&tasklist_lock));
- goto repeat;
- }
-
- ret = security_ptrace_traceme(current->parent);
-
- /*
- * Set the ptrace bit in the process ptrace flags.
- * Then link us on our parent's ptraced list.
- */
- if (!ret) {
- current->ptrace |= PT_PTRACED;
- __ptrace_link(current, current->real_parent);
- }
-
- write_unlock_irqrestore(&tasklist_lock, flags);
- }
- task_unlock(current);
- return ret;
-}
-
-/**
- * ptrace_get_task_struct -- grab a task struct reference for ptrace
- * @pid: process id to grab a task_struct reference of
- *
- * This function is a helper for ptrace implementations. It checks
- * permissions and then grabs a task struct for use of the actual
- * ptrace implementation.
- *
- * Returns the task_struct for @pid or an ERR_PTR() on failure.
- */
-struct task_struct *ptrace_get_task_struct(pid_t pid)
+static struct task_struct *ptrace_get_task_struct(pid_t pid)
{
struct task_struct *child;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
child = find_task_by_vpid(pid);
if (child)
get_task_struct(child);
+ rcu_read_unlock();
- read_unlock(&tasklist_lock);
if (!child)
return ERR_PTR(-ESRCH);
return child;
@@ -612,8 +634,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
- if (ret < 0)
- goto out_put_task_struct;
out_put_task_struct:
put_task_struct(child);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 490934fc7ac..00000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,788 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2001
- *
- * Authors: Dipankar Sarma <dipankar@in.ibm.com>
- * Manfred Spraul <manfred@colorfullife.com>
- *
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
- * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/time.h>
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
- .cur = -300,
- .completed = -300,
- .pending = -300,
- .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
- .cpumask = CPU_BITS_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .cur = -300,
- .completed = -300,
- .pending = -300,
- .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
- .cpumask = CPU_BITS_NONE,
-};
-
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
-
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
-
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
- struct rcu_ctrlblk *rcp)
-{
- int cpu;
- unsigned long flags;
-
- set_need_resched();
- spin_lock_irqsave(&rcp->lock, flags);
- if (unlikely(!rcp->signaled)) {
- rcp->signaled = 1;
- /*
- * Don't send IPI to itself. With irqs disabled,
- * rdp->cpu is the current cpu.
- *
- * cpu_online_mask is updated by the _cpu_down()
- * using __stop_machine(). Since we're in irqs disabled
- * section, __stop_machine() is not exectuting, hence
- * the cpu_online_mask is stable.
- *
- * However, a cpu might have been offlined _just_ before
- * we disabled irqs while entering here.
- * And rcu subsystem might not yet have handled the CPU_DEAD
- * notification, leading to the offlined cpu's bit
- * being set in the rcp->cpumask.
- *
- * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
- * sending smp_reschedule() to an offlined CPU.
- */
- for_each_cpu_and(cpu,
- to_cpumask(rcp->cpumask), cpu_online_mask) {
- if (cpu != rdp->cpu)
- smp_send_reschedule(cpu);
- }
- }
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
- struct rcu_ctrlblk *rcp)
-{
- set_need_resched();
-}
-#endif
-
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- long batch;
-
- head->next = NULL;
- smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
-
- /*
- * Determine the batch number of this callback.
- *
- * Using ACCESS_ONCE to avoid the following error when gcc eliminates
- * local variable "batch" and emits codes like this:
- * 1) rdp->batch = rcp->cur + 1 # gets old value
- * ......
- * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
- * then [*nxttail[0], *nxttail[1]) may contain callbacks
- * that batch# = rdp->batch, see the comment of struct rcu_data.
- */
- batch = ACCESS_ONCE(rcp->cur) + 1;
-
- if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
- /* process callbacks */
- rdp->nxttail[0] = rdp->nxttail[1];
- rdp->nxttail[1] = rdp->nxttail[2];
- if (rcu_batch_after(batch - 1, rdp->batch))
- rdp->nxttail[0] = rdp->nxttail[2];
- }
-
- rdp->batch = batch;
- *rdp->nxttail[2] = head;
- rdp->nxttail[2] = &head->next;
-
- if (unlikely(++rdp->qlen > qhimark)) {
- rdp->blimit = INT_MAX;
- force_quiescent_state(rdp, &rcu_ctrlblk);
- }
-}
-
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
- rcp->gp_start = jiffies;
- rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- int cpu;
- long delta;
- unsigned long flags;
-
- /* Only let one CPU complain about others per time interval. */
-
- spin_lock_irqsave(&rcp->lock, flags);
- delta = jiffies - rcp->jiffies_stall;
- if (delta < 2 || rcp->cur != rcp->completed) {
- spin_unlock_irqrestore(&rcp->lock, flags);
- return;
- }
- rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
- spin_unlock_irqrestore(&rcp->lock, flags);
-
- /* OK, time to rat on our buddy... */
-
- printk(KERN_ERR "INFO: RCU detected CPU stalls:");
- for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
- printk(" %d", cpu);
- }
- printk(" (detected by %d, t=%ld jiffies)\n",
- smp_processor_id(), (long)(jiffies - rcp->gp_start));
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long flags;
-
- printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
- smp_processor_id(), jiffies,
- jiffies - rcp->gp_start);
- dump_stack();
- spin_lock_irqsave(&rcp->lock, flags);
- if ((long)(jiffies - rcp->jiffies_stall) >= 0)
- rcp->jiffies_stall =
- jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
- spin_unlock_irqrestore(&rcp->lock, flags);
- set_need_resched(); /* kick ourselves to get things going. */
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- long delta;
-
- delta = jiffies - rcp->jiffies_stall;
- if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
- delta >= 0) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(rcp);
-
- } else if (rcp->cur != rcp->completed && delta >= 2) {
-
- /* They had two seconds to dump stack, so complain. */
- print_other_cpu_stall(rcp);
- }
-}
-
-#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
-
- head->func = func;
- local_irq_save(flags);
- __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
-
- head->func = func;
- local_irq_save(flags);
- __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-/*
- * Return the number of RCU batches processed thus far. Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
- return rcu_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
- * Return the number of RCU batches processed thus far. Useful
- * for debug and statistics.
- */
-long rcu_batches_completed_bh(void)
-{
- return rcu_bh_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
-/* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
-{
- raise_softirq(RCU_SOFTIRQ);
-}
-
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_head *next, *list;
- int count = 0;
-
- list = rdp->donelist;
- while (list) {
- next = list->next;
- prefetch(next);
- list->func(list);
- list = next;
- if (++count >= rdp->blimit)
- break;
- }
- rdp->donelist = list;
-
- local_irq_save(flags);
- rdp->qlen -= count;
- local_irq_restore(flags);
- if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
- rdp->blimit = blimit;
-
- if (!rdp->donelist)
- rdp->donetail = &rdp->donelist;
- else
- raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- * This is done by rcu_start_batch. The start is not broadcasted to
- * all cpus, they must pick this up by comparing rcp->cur with
- * rdp->quiescbatch. All cpus are recorded in the
- * rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- * Since the start of the grace period is not broadcasted, at least two
- * calls to rcu_check_quiescent_state are required:
- * The first call just notices that a new grace period is running. The
- * following calls check if there was a quiescent state since the beginning
- * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- * the bitmap is empty, then the grace period is completed.
- * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- * period (if necessary).
- */
-
-/*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
- */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
-{
- if (rcp->cur != rcp->pending &&
- rcp->completed == rcp->cur) {
- rcp->cur++;
- record_gp_stall_check_time(rcp);
-
- /*
- * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
- * Barrier Otherwise it can cause tickless idle CPUs to be
- * included in rcp->cpumask, which will extend graceperiods
- * unnecessarily.
- */
- smp_mb();
- cpumask_andnot(to_cpumask(rcp->cpumask),
- cpu_online_mask, nohz_cpu_mask);
-
- rcp->signaled = 0;
- }
-}
-
-/*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
- */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
-{
- cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
- if (cpumask_empty(to_cpumask(rcp->cpumask))) {
- /* batch completed ! */
- rcp->completed = rcp->cur;
- rcu_start_batch(rcp);
- }
-}
-
-/*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
- */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
-
- if (rdp->quiescbatch != rcp->cur) {
- /* start new grace period: */
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
- rdp->quiescbatch = rcp->cur;
- return;
- }
-
- /* Grace period already completed for this cpu?
- * qs_pending is checked instead of the actual bitmap to avoid
- * cacheline trashing.
- */
- if (!rdp->qs_pending)
- return;
-
- /*
- * Was there a quiescent state since the beginning of the grace
- * period? If no, then exit and wait for the next call.
- */
- if (!rdp->passed_quiesc)
- return;
- rdp->qs_pending = 0;
-
- spin_lock_irqsave(&rcp->lock, flags);
- /*
- * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
- * during cpu startup. Ignore the quiescent state.
- */
- if (likely(rdp->quiescbatch == rcp->cur))
- cpu_quiet(rdp->cpu, rcp);
-
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
- struct rcu_head **tail, long batch)
-{
- unsigned long flags;
-
- if (list) {
- local_irq_save(flags);
- this_rdp->batch = batch;
- *this_rdp->nxttail[2] = list;
- this_rdp->nxttail[2] = tail;
- local_irq_restore(flags);
- }
-}
-
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
- struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
- unsigned long flags;
-
- /*
- * if the cpu going offline owns the grace period
- * we can block indefinitely waiting for it, so flush
- * it here
- */
- spin_lock_irqsave(&rcp->lock, flags);
- if (rcp->cur != rcp->completed)
- cpu_quiet(rdp->cpu, rcp);
- rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
- rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
- spin_unlock(&rcp->lock);
-
- this_rdp->qlen += rdp->qlen;
- local_irq_restore(flags);
-}
-
-static void rcu_offline_cpu(int cpu)
-{
- struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
- struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
- __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
- &per_cpu(rcu_data, cpu));
- __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
- &per_cpu(rcu_bh_data, cpu));
- put_cpu_var(rcu_data);
- put_cpu_var(rcu_bh_data);
-}
-
-#else
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif
-
-/*
- * This does the RCU processing work from softirq context.
- */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
- long completed_snap;
-
- if (rdp->nxtlist) {
- local_irq_save(flags);
- completed_snap = ACCESS_ONCE(rcp->completed);
-
- /*
- * move the other grace-period-completed entries to
- * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
- */
- if (!rcu_batch_before(completed_snap, rdp->batch))
- rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
- else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
- rdp->nxttail[0] = rdp->nxttail[1];
-
- /*
- * the grace period for entries in
- * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
- * move these entries to donelist
- */
- if (rdp->nxttail[0] != &rdp->nxtlist) {
- *rdp->donetail = rdp->nxtlist;
- rdp->donetail = rdp->nxttail[0];
- rdp->nxtlist = *rdp->nxttail[0];
- *rdp->donetail = NULL;
-
- if (rdp->nxttail[1] == rdp->nxttail[0])
- rdp->nxttail[1] = &rdp->nxtlist;
- if (rdp->nxttail[2] == rdp->nxttail[0])
- rdp->nxttail[2] = &rdp->nxtlist;
- rdp->nxttail[0] = &rdp->nxtlist;
- }
-
- local_irq_restore(flags);
-
- if (rcu_batch_after(rdp->batch, rcp->pending)) {
- unsigned long flags2;
-
- /* and start it/schedule start if it's a new batch */
- spin_lock_irqsave(&rcp->lock, flags2);
- if (rcu_batch_after(rdp->batch, rcp->pending)) {
- rcp->pending = rdp->batch;
- rcu_start_batch(rcp);
- }
- spin_unlock_irqrestore(&rcp->lock, flags2);
- }
- }
-
- rcu_check_quiescent_state(rcp, rdp);
- if (rdp->donelist)
- rcu_do_batch(rdp);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be see before any RCU
- * grace-period manupulations below.
- */
-
- smp_mb(); /* See above block comment. */
-
- __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
- __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be see after any RCU
- * grace-period manupulations above.
- */
-
- smp_mb(); /* See above block comment. */
-}
-
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
- /* Check for CPU stalls, if enabled. */
- check_cpu_stall(rcp);
-
- if (rdp->nxtlist) {
- long completed_snap = ACCESS_ONCE(rcp->completed);
-
- /*
- * This cpu has pending rcu entries and the grace period
- * for them has completed.
- */
- if (!rcu_batch_before(completed_snap, rdp->batch))
- return 1;
- if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
- rdp->nxttail[0] != rdp->nxttail[1])
- return 1;
- if (rdp->nxttail[0] != &rdp->nxtlist)
- return 1;
-
- /*
- * This cpu has pending rcu entries and the new batch
- * for then hasn't been started nor scheduled start
- */
- if (rcu_batch_after(rdp->batch, rcp->pending))
- return 1;
- }
-
- /* This cpu has finished callbacks to invoke */
- if (rdp->donelist)
- return 1;
-
- /* The rcu core waits for a quiescent state from the cpu */
- if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
- return 1;
-
- /* nothing to do */
- return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so. This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-int rcu_pending(int cpu)
-{
- return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
- __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-
- return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
-}
-
-/*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt. This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
- */
-void rcu_check_callbacks(int cpu, int user)
-{
- if (user ||
- (idle_cpu(cpu) && !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-
- /*
- * Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so count it.
- *
- * Also do a memory barrier. This is needed to handle
- * the case where writes from a preempt-disable section
- * of code get reordered into schedule() by this CPU's
- * write buffer. The memory barrier makes sure that
- * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
- * by other CPUs to happen after any such write.
- */
-
- smp_mb(); /* See above block comment. */
- rcu_qsctr_inc(cpu);
- rcu_bh_qsctr_inc(cpu);
-
- } else if (!in_softirq()) {
-
- /*
- * Get here if this CPU did not take its interrupt from
- * softirq, in other words, if it is not interrupting
- * a rcu_bh read-side critical section. This is an _bh
- * critical section, so count it. The memory barrier
- * is needed for the same reason as is the above one.
- */
-
- smp_mb(); /* See above block comment. */
- rcu_bh_qsctr_inc(cpu);
- }
- raise_rcu_softirq();
-}
-
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
- struct rcu_data *rdp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rcp->lock, flags);
- memset(rdp, 0, sizeof(*rdp));
- rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
- rdp->donetail = &rdp->donelist;
- rdp->quiescbatch = rcp->completed;
- rdp->qs_pending = 0;
- rdp->cpu = cpu;
- rdp->blimit = blimit;
- spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void __cpuinit rcu_online_cpu(int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
- rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
- rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_online_cpu(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- rcu_offline_cpu(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
-
-/*
- * Initializes rcu mechanism. Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
-void __init __rcu_init(void)
-{
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
- printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- /* Register notifier for non-boot CPUs */
- register_cpu_notifier(&rcu_nb);
-}
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881a..bd5d5c8e514 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/module.h>
+#include <linux/kernel_stat.h>
enum rcu_barrier {
RCU_BARRIER_STD,
@@ -55,6 +56,11 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
+int rcu_scheduler_active __read_mostly;
+
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
/*
* Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +86,10 @@ void wakeme_after_rcu(struct rcu_head *head)
void synchronize_rcu(void)
{
struct rcu_synchronize rcu;
+
+ if (rcu_blocking_is_gp())
+ return;
+
init_completion(&rcu.completion);
/* Will wake me after RCU finished. */
call_rcu(&rcu.head, wakeme_after_rcu);
@@ -88,6 +98,30 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+ struct rcu_synchronize rcu;
+
+ if (rcu_blocking_is_gp())
+ return;
+
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ call_rcu_bh(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -116,6 +150,12 @@ static void rcu_barrier_func(void *type)
}
}
+static inline void wait_migrated_callbacks(void)
+{
+ wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+ smp_mb(); /* In case we didn't sleep. */
+}
+
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
@@ -141,6 +181,7 @@ static void _rcu_barrier(enum rcu_barrier type)
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
+ wait_migrated_callbacks();
}
/**
@@ -170,8 +211,61 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+static void rcu_migrate_callback(struct rcu_head *notused)
+{
+ if (atomic_dec_and_test(&rcu_migrate_type_count))
+ wake_up(&rcu_migrate_wq);
+}
+
+extern int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu);
+
+static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ rcu_cpu_notify(self, action, hcpu);
+ if (action == CPU_DYING) {
+ /*
+ * preempt_disable() in on_each_cpu() prevents stop_machine(),
+ * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
+ * returns, all online cpus have queued rcu_barrier_func(),
+ * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
+ *
+ * These callbacks ensure _rcu_barrier() waits for all
+ * RCU callbacks of the specified type to complete.
+ */
+ atomic_set(&rcu_migrate_type_count, 3);
+ call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
+ call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
+ call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+ } else if (action == CPU_DOWN_PREPARE) {
+ /* Don't need to wait until next removal operation. */
+ /* rcu_migrate_head is protected by cpu_add_remove_lock */
+ wait_migrated_callbacks();
+ }
+
+ return NOTIFY_OK;
+}
+
void __init rcu_init(void)
{
+ int i;
+
__rcu_init();
+ cpu_notifier(rcu_barrier_cpu_hotplug, 0);
+
+ /*
+ * We don't need protection against CPU-hotplug here because
+ * this is called early in boot, before either interrupts
+ * or the scheduler are operational.
+ */
+ for_each_online_cpu(i)
+ rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
}
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index 33cfc50781f..00000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1502 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, realtime implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2006
- *
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
- * for pushing me away from locks and towards counters, and
- * to Suparna Bhattacharya for pushing me completely away
- * from atomic instructions on the read side.
- *
- * - Added handling of Dynamic Ticks
- * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
- * - Steven Rostedt <srostedt@redhat.com>
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * Design Document: http://lwn.net/Articles/253651/
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/random.h>
-#include <linux/delay.h>
-#include <linux/cpumask.h>
-#include <linux/rcupreempt_trace.h>
-#include <asm/byteorder.h>
-
-/*
- * PREEMPT_RCU data structures.
- */
-
-/*
- * GP_STAGES specifies the number of times the state machine has
- * to go through the all the rcu_try_flip_states (see below)
- * in a single Grace Period.
- *
- * GP in GP_STAGES stands for Grace Period ;)
- */
-#define GP_STAGES 2
-struct rcu_data {
- spinlock_t lock; /* Protect rcu_data fields. */
- long completed; /* Number of last completed batch. */
- int waitlistcount;
- struct rcu_head *nextlist;
- struct rcu_head **nexttail;
- struct rcu_head *waitlist[GP_STAGES];
- struct rcu_head **waittail[GP_STAGES];
- struct rcu_head *donelist; /* from waitlist & waitschedlist */
- struct rcu_head **donetail;
- long rcu_flipctr[2];
- struct rcu_head *nextschedlist;
- struct rcu_head **nextschedtail;
- struct rcu_head *waitschedlist;
- struct rcu_head **waitschedtail;
- int rcu_sched_sleeping;
-#ifdef CONFIG_RCU_TRACE
- struct rcupreempt_trace trace;
-#endif /* #ifdef CONFIG_RCU_TRACE */
-};
-
-/*
- * States for rcu_try_flip() and friends.
- */
-
-enum rcu_try_flip_states {
-
- /*
- * Stay here if nothing is happening. Flip the counter if somthing
- * starts happening. Denoted by "I"
- */
- rcu_try_flip_idle_state,
-
- /*
- * Wait here for all CPUs to notice that the counter has flipped. This
- * prevents the old set of counters from ever being incremented once
- * we leave this state, which in turn is necessary because we cannot
- * test any individual counter for zero -- we can only check the sum.
- * Denoted by "A".
- */
- rcu_try_flip_waitack_state,
-
- /*
- * Wait here for the sum of the old per-CPU counters to reach zero.
- * Denoted by "Z".
- */
- rcu_try_flip_waitzero_state,
-
- /*
- * Wait here for each of the other CPUs to execute a memory barrier.
- * This is necessary to ensure that these other CPUs really have
- * completed executing their RCU read-side critical sections, despite
- * their CPUs wildly reordering memory. Denoted by "M".
- */
- rcu_try_flip_waitmb_state,
-};
-
-/*
- * States for rcu_ctrlblk.rcu_sched_sleep.
- */
-
-enum rcu_sched_sleep_states {
- rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
- rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
- rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
-};
-
-struct rcu_ctrlblk {
- spinlock_t fliplock; /* Protect state-machine transitions. */
- long completed; /* Number of last completed batch. */
- enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
- the rcu state machine */
- spinlock_t schedlock; /* Protect rcu_sched sleep state. */
- enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
- wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
-};
-
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
-static struct rcu_ctrlblk rcu_ctrlblk = {
- .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
- .completed = 0,
- .rcu_try_flip_state = rcu_try_flip_idle_state,
- .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
- .sched_sleep = rcu_sched_not_sleeping,
- .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
-};
-
-static struct task_struct *rcu_sched_grace_period_task;
-
-#ifdef CONFIG_RCU_TRACE
-static char *rcu_try_flip_state_names[] =
- { "idle", "waitack", "waitzero", "waitmb" };
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
- = CPU_BITS_NONE;
-
-/*
- * Enum and per-CPU flag to determine when each CPU has seen
- * the most recent counter flip.
- */
-
-enum rcu_flip_flag_values {
- rcu_flip_seen, /* Steady/initial state, last flip seen. */
- /* Only GP detector can update. */
- rcu_flipped /* Flip just completed, need confirmation. */
- /* Only corresponding CPU can update. */
-};
-static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
- = rcu_flip_seen;
-
-/*
- * Enum and per-CPU flag to determine when each CPU has executed the
- * needed memory barrier to fence in memory references from its last RCU
- * read-side critical section in the just-completed grace period.
- */
-
-enum rcu_mb_flag_values {
- rcu_mb_done, /* Steady/initial state, no mb()s required. */
- /* Only GP detector can update. */
- rcu_mb_needed /* Flip just completed, need an mb(). */
- /* Only corresponding CPU can update. */
-};
-static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
- = rcu_mb_done;
-
-/*
- * RCU_DATA_ME: find the current CPU's rcu_data structure.
- * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
- */
-#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
-#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is not
- * cached in a local variable, but where the CPU number is so cached.
- */
-#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is not
- * cached in a local variable.
- */
-#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
-
-/*
- * Helper macro for tracing when the appropriate rcu_data is pointed
- * to by a local variable.
- */
-#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
-
-#define RCU_SCHED_BATCH_TIME (HZ / 50)
-
-/*
- * Return the number of RCU batches processed thus far. Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
- return rcu_ctrlblk.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-void __rcu_read_lock(void)
-{
- int idx;
- struct task_struct *t = current;
- int nesting;
-
- nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
- if (nesting != 0) {
-
- /* An earlier rcu_read_lock() covers us, just count it. */
-
- t->rcu_read_lock_nesting = nesting + 1;
-
- } else {
- unsigned long flags;
-
- /*
- * We disable interrupts for the following reasons:
- * - If we get scheduling clock interrupt here, and we
- * end up acking the counter flip, it's like a promise
- * that we will never increment the old counter again.
- * Thus we will break that promise if that
- * scheduling clock interrupt happens between the time
- * we pick the .completed field and the time that we
- * increment our counter.
- *
- * - We don't want to be preempted out here.
- *
- * NMIs can still occur, of course, and might themselves
- * contain rcu_read_lock().
- */
-
- local_irq_save(flags);
-
- /*
- * Outermost nesting of rcu_read_lock(), so increment
- * the current counter for the current CPU. Use volatile
- * casts to prevent the compiler from reordering.
- */
-
- idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
- ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
-
- /*
- * Now that the per-CPU counter has been incremented, we
- * are protected from races with rcu_read_lock() invoked
- * from NMI handlers on this CPU. We can therefore safely
- * increment the nesting counter, relieving further NMIs
- * of the need to increment the per-CPU counter.
- */
-
- ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
-
- /*
- * Now that we have preventing any NMIs from storing
- * to the ->rcu_flipctr_idx, we can safely use it to
- * remember which counter to decrement in the matching
- * rcu_read_unlock().
- */
-
- ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
- local_irq_restore(flags);
- }
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-void __rcu_read_unlock(void)
-{
- int idx;
- struct task_struct *t = current;
- int nesting;
-
- nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
- if (nesting > 1) {
-
- /*
- * We are still protected by the enclosing rcu_read_lock(),
- * so simply decrement the counter.
- */
-
- t->rcu_read_lock_nesting = nesting - 1;
-
- } else {
- unsigned long flags;
-
- /*
- * Disable local interrupts to prevent the grace-period
- * detection state machine from seeing us half-done.
- * NMIs can still occur, of course, and might themselves
- * contain rcu_read_lock() and rcu_read_unlock().
- */
-
- local_irq_save(flags);
-
- /*
- * Outermost nesting of rcu_read_unlock(), so we must
- * decrement the current counter for the current CPU.
- * This must be done carefully, because NMIs can
- * occur at any point in this code, and any rcu_read_lock()
- * and rcu_read_unlock() pairs in the NMI handlers
- * must interact non-destructively with this code.
- * Lots of volatile casts, and -very- careful ordering.
- *
- * Changes to this code, including this one, must be
- * inspected, validated, and tested extremely carefully!!!
- */
-
- /*
- * First, pick up the index.
- */
-
- idx = ACCESS_ONCE(t->rcu_flipctr_idx);
-
- /*
- * Now that we have fetched the counter index, it is
- * safe to decrement the per-task RCU nesting counter.
- * After this, any interrupts or NMIs will increment and
- * decrement the per-CPU counters.
- */
- ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
-
- /*
- * It is now safe to decrement this task's nesting count.
- * NMIs that occur after this statement will route their
- * rcu_read_lock() calls through this "else" clause, and
- * will thus start incrementing the per-CPU counter on
- * their own. They will also clobber ->rcu_flipctr_idx,
- * but that is OK, since we have already fetched it.
- */
-
- ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
- local_irq_restore(flags);
- }
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-/*
- * If a global counter flip has occurred since the last time that we
- * advanced callbacks, advance them. Hardware interrupts must be
- * disabled when calling this function.
- */
-static void __rcu_advance_callbacks(struct rcu_data *rdp)
-{
- int cpu;
- int i;
- int wlc = 0;
-
- if (rdp->completed != rcu_ctrlblk.completed) {
- if (rdp->waitlist[GP_STAGES - 1] != NULL) {
- *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
- rdp->donetail = rdp->waittail[GP_STAGES - 1];
- RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
- }
- for (i = GP_STAGES - 2; i >= 0; i--) {
- if (rdp->waitlist[i] != NULL) {
- rdp->waitlist[i + 1] = rdp->waitlist[i];
- rdp->waittail[i + 1] = rdp->waittail[i];
- wlc++;
- } else {
- rdp->waitlist[i + 1] = NULL;
- rdp->waittail[i + 1] =
- &rdp->waitlist[i + 1];
- }
- }
- if (rdp->nextlist != NULL) {
- rdp->waitlist[0] = rdp->nextlist;
- rdp->waittail[0] = rdp->nexttail;
- wlc++;
- rdp->nextlist = NULL;
- rdp->nexttail = &rdp->nextlist;
- RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
- } else {
- rdp->waitlist[0] = NULL;
- rdp->waittail[0] = &rdp->waitlist[0];
- }
- rdp->waitlistcount = wlc;
- rdp->completed = rcu_ctrlblk.completed;
- }
-
- /*
- * Check to see if this CPU needs to report that it has seen
- * the most recent counter flip, thereby declaring that all
- * subsequent rcu_read_lock() invocations will respect this flip.
- */
-
- cpu = raw_smp_processor_id();
- if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
- smp_mb(); /* Subsequent counter accesses must see new value */
- per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
- smp_mb(); /* Subsequent RCU read-side critical sections */
- /* seen -after- acknowledgement. */
- }
-}
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
- .dynticks = 1,
-};
-
-#ifdef CONFIG_NO_HZ
-static DEFINE_PER_CPU(int, rcu_update_flag);
-
-/**
- * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
- *
- * If the CPU was idle with dynamic ticks active, this updates the
- * rcu_dyntick_sched.dynticks to let the RCU handling know that the
- * CPU is active.
- */
-void rcu_irq_enter(void)
-{
- int cpu = smp_processor_id();
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- if (per_cpu(rcu_update_flag, cpu))
- per_cpu(rcu_update_flag, cpu)++;
-
- /*
- * Only update if we are coming from a stopped ticks mode
- * (rcu_dyntick_sched.dynticks is even).
- */
- if (!in_interrupt() &&
- (rdssp->dynticks & 0x1) == 0) {
- /*
- * The following might seem like we could have a race
- * with NMI/SMIs. But this really isn't a problem.
- * Here we do a read/modify/write, and the race happens
- * when an NMI/SMI comes in after the read and before
- * the write. But NMI/SMIs will increment this counter
- * twice before returning, so the zero bit will not
- * be corrupted by the NMI/SMI which is the most important
- * part.
- *
- * The only thing is that we would bring back the counter
- * to a postion that it was in during the NMI/SMI.
- * But the zero bit would be set, so the rest of the
- * counter would again be ignored.
- *
- * On return from the IRQ, the counter may have the zero
- * bit be 0 and the counter the same as the return from
- * the NMI/SMI. If the state machine was so unlucky to
- * see that, it still doesn't matter, since all
- * RCU read-side critical sections on this CPU would
- * have already completed.
- */
- rdssp->dynticks++;
- /*
- * The following memory barrier ensures that any
- * rcu_read_lock() primitives in the irq handler
- * are seen by other CPUs to follow the above
- * increment to rcu_dyntick_sched.dynticks. This is
- * required in order for other CPUs to correctly
- * determine when it is safe to advance the RCU
- * grace-period state machine.
- */
- smp_mb(); /* see above block comment. */
- /*
- * Since we can't determine the dynamic tick mode from
- * the rcu_dyntick_sched.dynticks after this routine,
- * we use a second flag to acknowledge that we came
- * from an idle state with ticks stopped.
- */
- per_cpu(rcu_update_flag, cpu)++;
- /*
- * If we take an NMI/SMI now, they will also increment
- * the rcu_update_flag, and will not update the
- * rcu_dyntick_sched.dynticks on exit. That is for
- * this IRQ to do.
- */
- }
-}
-
-/**
- * rcu_irq_exit - Called from exiting Hard irq context.
- *
- * If the CPU was idle with dynamic ticks active, update the
- * rcu_dyntick_sched.dynticks to put let the RCU handling be
- * aware that the CPU is going back to idle with no ticks.
- */
-void rcu_irq_exit(void)
-{
- int cpu = smp_processor_id();
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- /*
- * rcu_update_flag is set if we interrupted the CPU
- * when it was idle with ticks stopped.
- * Once this occurs, we keep track of interrupt nesting
- * because a NMI/SMI could also come in, and we still
- * only want the IRQ that started the increment of the
- * rcu_dyntick_sched.dynticks to be the one that modifies
- * it on exit.
- */
- if (per_cpu(rcu_update_flag, cpu)) {
- if (--per_cpu(rcu_update_flag, cpu))
- return;
-
- /* This must match the interrupt nesting */
- WARN_ON(in_interrupt());
-
- /*
- * If an NMI/SMI happens now we are still
- * protected by the rcu_dyntick_sched.dynticks being odd.
- */
-
- /*
- * The following memory barrier ensures that any
- * rcu_read_unlock() primitives in the irq handler
- * are seen by other CPUs to preceed the following
- * increment to rcu_dyntick_sched.dynticks. This
- * is required in order for other CPUs to determine
- * when it is safe to advance the RCU grace-period
- * state machine.
- */
- smp_mb(); /* see above block comment. */
- rdssp->dynticks++;
- WARN_ON(rdssp->dynticks & 0x1);
- }
-}
-
-void rcu_nmi_enter(void)
-{
- rcu_irq_enter();
-}
-
-void rcu_nmi_exit(void)
-{
- rcu_irq_exit();
-}
-
-static void dyntick_save_progress_counter(int cpu)
-{
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- rdssp->dynticks_snap = rdssp->dynticks;
-}
-
-static inline int
-rcu_try_flip_waitack_needed(int cpu)
-{
- long curr;
- long snap;
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- curr = rdssp->dynticks;
- snap = rdssp->dynticks_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
- /*
- * If the CPU remained in dynticks mode for the entire time
- * and didn't take any interrupts, NMIs, SMIs, or whatever,
- * then it cannot be in the middle of an rcu_read_lock(), so
- * the next rcu_read_lock() it executes must use the new value
- * of the counter. So we can safely pretend that this CPU
- * already acknowledged the counter.
- */
-
- if ((curr == snap) && ((curr & 0x1) == 0))
- return 0;
-
- /*
- * If the CPU passed through or entered a dynticks idle phase with
- * no active irq handlers, then, as above, we can safely pretend
- * that this CPU already acknowledged the counter.
- */
-
- if ((curr - snap) > 2 || (curr & 0x1) == 0)
- return 0;
-
- /* We need this CPU to explicitly acknowledge the counter flip. */
-
- return 1;
-}
-
-static inline int
-rcu_try_flip_waitmb_needed(int cpu)
-{
- long curr;
- long snap;
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- curr = rdssp->dynticks;
- snap = rdssp->dynticks_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
- /*
- * If the CPU remained in dynticks mode for the entire time
- * and didn't take any interrupts, NMIs, SMIs, or whatever,
- * then it cannot have executed an RCU read-side critical section
- * during that time, so there is no need for it to execute a
- * memory barrier.
- */
-
- if ((curr == snap) && ((curr & 0x1) == 0))
- return 0;
-
- /*
- * If the CPU either entered or exited an outermost interrupt,
- * SMI, NMI, or whatever handler, then we know that it executed
- * a memory barrier when doing so. So we don't need another one.
- */
- if (curr != snap)
- return 0;
-
- /* We need the CPU to execute a memory barrier. */
-
- return 1;
-}
-
-static void dyntick_save_progress_counter_sched(int cpu)
-{
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- rdssp->sched_dynticks_snap = rdssp->dynticks;
-}
-
-static int rcu_qsctr_inc_needed_dyntick(int cpu)
-{
- long curr;
- long snap;
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- curr = rdssp->dynticks;
- snap = rdssp->sched_dynticks_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
-
- /*
- * If the CPU remained in dynticks mode for the entire time
- * and didn't take any interrupts, NMIs, SMIs, or whatever,
- * then it cannot be in the middle of an rcu_read_lock(), so
- * the next rcu_read_lock() it executes must use the new value
- * of the counter. Therefore, this CPU has been in a quiescent
- * state the entire time, and we don't need to wait for it.
- */
-
- if ((curr == snap) && ((curr & 0x1) == 0))
- return 0;
-
- /*
- * If the CPU passed through or entered a dynticks idle phase with
- * no active irq handlers, then, as above, this CPU has already
- * passed through a quiescent state.
- */
-
- if ((curr - snap) > 2 || (snap & 0x1) == 0)
- return 0;
-
- /* We need this CPU to go through a quiescent state. */
-
- return 1;
-}
-
-#else /* !CONFIG_NO_HZ */
-
-# define dyntick_save_progress_counter(cpu) do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu) (1)
-# define rcu_try_flip_waitmb_needed(cpu) (1)
-
-# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
-# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
-
-#endif /* CONFIG_NO_HZ */
-
-static void save_qsctr_sched(int cpu)
-{
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- rdssp->sched_qs_snap = rdssp->sched_qs;
-}
-
-static inline int rcu_qsctr_inc_needed(int cpu)
-{
- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
- /*
- * If there has been a quiescent state, no more need to wait
- * on this CPU.
- */
-
- if (rdssp->sched_qs != rdssp->sched_qs_snap) {
- smp_mb(); /* force ordering with cpu entering schedule(). */
- return 0;
- }
-
- /* We need this CPU to go through a quiescent state. */
-
- return 1;
-}
-
-/*
- * Get here when RCU is idle. Decide whether we need to
- * move out of idle state, and return non-zero if so.
- * "Straightforward" approach for the moment, might later
- * use callback-list lengths, grace-period duration, or
- * some such to determine when to exit idle state.
- * Might also need a pre-idle test that does not acquire
- * the lock, but let's get the simple case working first...
- */
-
-static int
-rcu_try_flip_idle(void)
-{
- int cpu;
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
- if (!rcu_pending(smp_processor_id())) {
- RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
- return 0;
- }
-
- /*
- * Do the flip.
- */
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
- rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
-
- /*
- * Need a memory barrier so that other CPUs see the new
- * counter value before they see the subsequent change of all
- * the rcu_flip_flag instances to rcu_flipped.
- */
-
- smp_mb(); /* see above block comment. */
-
- /* Now ask each CPU for acknowledgement of the flip. */
-
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
- per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
- dyntick_save_progress_counter(cpu);
- }
-
- return 1;
-}
-
-/*
- * Wait for CPUs to acknowledge the flip.
- */
-
-static int
-rcu_try_flip_waitack(void)
-{
- int cpu;
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
- if (rcu_try_flip_waitack_needed(cpu) &&
- per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
- RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
- return 0;
- }
-
- /*
- * Make sure our checks above don't bleed into subsequent
- * waiting for the sum of the counters to reach zero.
- */
-
- smp_mb(); /* see above block comment. */
- RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
- return 1;
-}
-
-/*
- * Wait for collective ``last'' counter to reach zero,
- * then tell all CPUs to do an end-of-grace-period memory barrier.
- */
-
-static int
-rcu_try_flip_waitzero(void)
-{
- int cpu;
- int lastidx = !(rcu_ctrlblk.completed & 0x1);
- int sum = 0;
-
- /* Check to see if the sum of the "last" counters is zero. */
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
- sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
- if (sum != 0) {
- RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
- return 0;
- }
-
- /*
- * This ensures that the other CPUs see the call for
- * memory barriers -after- the sum to zero has been
- * detected here
- */
- smp_mb(); /* ^^^^^^^^^^^^ */
-
- /* Call for a memory barrier from each CPU. */
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
- per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
- dyntick_save_progress_counter(cpu);
- }
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
- return 1;
-}
-
-/*
- * Wait for all CPUs to do their end-of-grace-period memory barrier.
- * Return 0 once all CPUs have done so.
- */
-
-static int
-rcu_try_flip_waitmb(void)
-{
- int cpu;
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
- for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
- if (rcu_try_flip_waitmb_needed(cpu) &&
- per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
- RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
- return 0;
- }
-
- smp_mb(); /* Ensure that the above checks precede any following flip. */
- RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
- return 1;
-}
-
-/*
- * Attempt a single flip of the counters. Remember, a single flip does
- * -not- constitute a grace period. Instead, the interval between
- * at least GP_STAGES consecutive flips is a grace period.
- *
- * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
- * on a large SMP, they might want to use a hierarchical organization of
- * the per-CPU-counter pairs.
- */
-static void rcu_try_flip(void)
-{
- unsigned long flags;
-
- RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
- if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
- RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
- return;
- }
-
- /*
- * Take the next transition(s) through the RCU grace-period
- * flip-counter state machine.
- */
-
- switch (rcu_ctrlblk.rcu_try_flip_state) {
- case rcu_try_flip_idle_state:
- if (rcu_try_flip_idle())
- rcu_ctrlblk.rcu_try_flip_state =
- rcu_try_flip_waitack_state;
- break;
- case rcu_try_flip_waitack_state:
- if (rcu_try_flip_waitack())
- rcu_ctrlblk.rcu_try_flip_state =
- rcu_try_flip_waitzero_state;
- break;
- case rcu_try_flip_waitzero_state:
- if (rcu_try_flip_waitzero())
- rcu_ctrlblk.rcu_try_flip_state =
- rcu_try_flip_waitmb_state;
- break;
- case rcu_try_flip_waitmb_state:
- if (rcu_try_flip_waitmb())
- rcu_ctrlblk.rcu_try_flip_state =
- rcu_try_flip_idle_state;
- }
- spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-}
-
-/*
- * Check to see if this CPU needs to do a memory barrier in order to
- * ensure that any prior RCU read-side critical sections have committed
- * their counter manipulations and critical-section memory references
- * before declaring the grace period to be completed.
- */
-static void rcu_check_mb(int cpu)
-{
- if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
- smp_mb(); /* Ensure RCU read-side accesses are visible. */
- per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
- }
-}
-
-void rcu_check_callbacks(int cpu, int user)
-{
- unsigned long flags;
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
- /*
- * If this CPU took its interrupt from user mode or from the
- * idle loop, and this is not a nested interrupt, then
- * this CPU has to have exited all prior preept-disable
- * sections of code. So increment the counter to note this.
- *
- * The memory barrier is needed to handle the case where
- * writes from a preempt-disable section of code get reordered
- * into schedule() by this CPU's write buffer. So the memory
- * barrier makes sure that the rcu_qsctr_inc() is seen by other
- * CPUs to happen after any such write.
- */
-
- if (user ||
- (idle_cpu(cpu) && !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
- smp_mb(); /* Guard against aggressive schedule(). */
- rcu_qsctr_inc(cpu);
- }
-
- rcu_check_mb(cpu);
- if (rcu_ctrlblk.completed == rdp->completed)
- rcu_try_flip();
- spin_lock_irqsave(&rdp->lock, flags);
- RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
- __rcu_advance_callbacks(rdp);
- if (rdp->donelist == NULL) {
- spin_unlock_irqrestore(&rdp->lock, flags);
- } else {
- spin_unlock_irqrestore(&rdp->lock, flags);
- raise_softirq(RCU_SOFTIRQ);
- }
-}
-
-/*
- * Needed by dynticks, to make sure all RCU processing has finished
- * when we go idle:
- */
-void rcu_advance_callbacks(int cpu, int user)
-{
- unsigned long flags;
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
- if (rcu_ctrlblk.completed == rdp->completed) {
- rcu_try_flip();
- if (rcu_ctrlblk.completed == rdp->completed)
- return;
- }
- spin_lock_irqsave(&rdp->lock, flags);
- RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
- __rcu_advance_callbacks(rdp);
- spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
- *dsttail = srclist; \
- if (srclist != NULL) { \
- dsttail = srctail; \
- srclist = NULL; \
- srctail = &srclist;\
- } \
- } while (0)
-
-void rcu_offline_cpu(int cpu)
-{
- int i;
- struct rcu_head *list = NULL;
- unsigned long flags;
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
- struct rcu_head *schedlist = NULL;
- struct rcu_head **schedtail = &schedlist;
- struct rcu_head **tail = &list;
-
- /*
- * Remove all callbacks from the newly dead CPU, retaining order.
- * Otherwise rcu_barrier() will fail
- */
-
- spin_lock_irqsave(&rdp->lock, flags);
- rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
- for (i = GP_STAGES - 1; i >= 0; i--)
- rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
- list, tail);
- rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
- rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
- schedlist, schedtail);
- rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
- schedlist, schedtail);
- rdp->rcu_sched_sleeping = 0;
- spin_unlock_irqrestore(&rdp->lock, flags);
- rdp->waitlistcount = 0;
-
- /* Disengage the newly dead CPU from the grace-period computation. */
-
- spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
- rcu_check_mb(cpu);
- if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
- smp_mb(); /* Subsequent counter accesses must see new value */
- per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
- smp_mb(); /* Subsequent RCU read-side critical sections */
- /* seen -after- acknowledgement. */
- }
-
- RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
- RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
-
- RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
- RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
-
- cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
-
- spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-
- /*
- * Place the removed callbacks on the current CPU's queue.
- * Make them all start a new grace period: simple approach,
- * in theory could starve a given set of callbacks, but
- * you would need to be doing some serious CPU hotplugging
- * to make this happen. If this becomes a problem, adding
- * a synchronize_rcu() to the hotplug path would be a simple
- * fix.
- */
-
- local_irq_save(flags); /* disable preempt till we know what lock. */
- rdp = RCU_DATA_ME();
- spin_lock(&rdp->lock);
- *rdp->nexttail = list;
- if (list)
- rdp->nexttail = tail;
- *rdp->nextschedtail = schedlist;
- if (schedlist)
- rdp->nextschedtail = schedtail;
- spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-void __cpuinit rcu_online_cpu(int cpu)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
- cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
- spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-
- /*
- * The rcu_sched grace-period processing might have bypassed
- * this CPU, given that it was not in the rcu_cpu_online_map
- * when the grace-period scan started. This means that the
- * grace-period task might sleep. So make sure that if this
- * should happen, the first callback posted to this CPU will
- * wake up the grace-period task if need be.
- */
-
- rdp = RCU_DATA_CPU(cpu);
- spin_lock_irqsave(&rdp->lock, flags);
- rdp->rcu_sched_sleeping = 1;
- spin_unlock_irqrestore(&rdp->lock, flags);
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
- unsigned long flags;
- struct rcu_head *next, *list;
- struct rcu_data *rdp;
-
- local_irq_save(flags);
- rdp = RCU_DATA_ME();
- spin_lock(&rdp->lock);
- list = rdp->donelist;
- if (list == NULL) {
- spin_unlock_irqrestore(&rdp->lock, flags);
- return;
- }
- rdp->donelist = NULL;
- rdp->donetail = &rdp->donelist;
- RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
- spin_unlock_irqrestore(&rdp->lock, flags);
- while (list) {
- next = list->next;
- list->func(list);
- list = next;
- RCU_TRACE_ME(rcupreempt_trace_invoke);
- }
-}
-
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- head->func = func;
- head->next = NULL;
- local_irq_save(flags);
- rdp = RCU_DATA_ME();
- spin_lock(&rdp->lock);
- __rcu_advance_callbacks(rdp);
- *rdp->nexttail = head;
- rdp->nexttail = &head->next;
- RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
- spin_unlock_irqrestore(&rdp->lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
- unsigned long flags;
- struct rcu_data *rdp;
- int wake_gp = 0;
-
- head->func = func;
- head->next = NULL;
- local_irq_save(flags);
- rdp = RCU_DATA_ME();
- spin_lock(&rdp->lock);
- *rdp->nextschedtail = head;
- rdp->nextschedtail = &head->next;
- if (rdp->rcu_sched_sleeping) {
-
- /* Grace-period processing might be sleeping... */
-
- rdp->rcu_sched_sleeping = 0;
- wake_gp = 1;
- }
- spin_unlock_irqrestore(&rdp->lock, flags);
- if (wake_gp) {
-
- /* Wake up grace-period processing, unless someone beat us. */
-
- spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
- if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
- wake_gp = 0;
- rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
- spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
- if (wake_gp)
- wake_up_interruptible(&rcu_ctrlblk.sched_wq);
- }
-}
-EXPORT_SYMBOL_GPL(call_rcu_sched);
-
-/*
- * Wait until all currently running preempt_disable() code segments
- * (including hardware-irq-disable segments) complete. Note that
- * in -rt this does -not- necessarily result in all currently executing
- * interrupt -handlers- having completed.
- */
-void __synchronize_sched(void)
-{
- struct rcu_synchronize rcu;
-
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu_sched(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(__synchronize_sched);
-
-/*
- * kthread function that manages call_rcu_sched grace periods.
- */
-static int rcu_sched_grace_period(void *arg)
-{
- int couldsleep; /* might sleep after current pass. */
- int couldsleepnext = 0; /* might sleep after next pass. */
- int cpu;
- unsigned long flags;
- struct rcu_data *rdp;
- int ret;
-
- /*
- * Each pass through the following loop handles one
- * rcu_sched grace period cycle.
- */
- do {
- /* Save each CPU's current state. */
-
- for_each_online_cpu(cpu) {
- dyntick_save_progress_counter_sched(cpu);
- save_qsctr_sched(cpu);
- }
-
- /*
- * Sleep for about an RCU grace-period's worth to
- * allow better batching and to consume less CPU.
- */
- schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
-
- /*
- * If there was nothing to do last time, prepare to
- * sleep at the end of the current grace period cycle.
- */
- couldsleep = couldsleepnext;
- couldsleepnext = 1;
- if (couldsleep) {
- spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
- rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
- spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
- }
-
- /*
- * Wait on each CPU in turn to have either visited
- * a quiescent state or been in dynticks-idle mode.
- */
- for_each_online_cpu(cpu) {
- while (rcu_qsctr_inc_needed(cpu) &&
- rcu_qsctr_inc_needed_dyntick(cpu)) {
- /* resched_cpu(cpu); @@@ */
- schedule_timeout_interruptible(1);
- }
- }
-
- /* Advance callbacks for each CPU. */
-
- for_each_online_cpu(cpu) {
-
- rdp = RCU_DATA_CPU(cpu);
- spin_lock_irqsave(&rdp->lock, flags);
-
- /*
- * We are running on this CPU irq-disabled, so no
- * CPU can go offline until we re-enable irqs.
- * The current CPU might have already gone
- * offline (between the for_each_offline_cpu and
- * the spin_lock_irqsave), but in that case all its
- * callback lists will be empty, so no harm done.
- *
- * Advance the callbacks! We share normal RCU's
- * donelist, since callbacks are invoked the
- * same way in either case.
- */
- if (rdp->waitschedlist != NULL) {
- *rdp->donetail = rdp->waitschedlist;
- rdp->donetail = rdp->waitschedtail;
-
- /*
- * Next rcu_check_callbacks() will
- * do the required raise_softirq().
- */
- }
- if (rdp->nextschedlist != NULL) {
- rdp->waitschedlist = rdp->nextschedlist;
- rdp->waitschedtail = rdp->nextschedtail;
- couldsleep = 0;
- couldsleepnext = 0;
- } else {
- rdp->waitschedlist = NULL;
- rdp->waitschedtail = &rdp->waitschedlist;
- }
- rdp->nextschedlist = NULL;
- rdp->nextschedtail = &rdp->nextschedlist;
-
- /* Mark sleep intention. */
-
- rdp->rcu_sched_sleeping = couldsleep;
-
- spin_unlock_irqrestore(&rdp->lock, flags);
- }
-
- /* If we saw callbacks on the last scan, go deal with them. */
-
- if (!couldsleep)
- continue;
-
- /* Attempt to block... */
-
- spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
- if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
-
- /*
- * Someone posted a callback after we scanned.
- * Go take care of it.
- */
- spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
- couldsleepnext = 0;
- continue;
- }
-
- /* Block until the next person posts a callback. */
-
- rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
- spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
- ret = 0;
- __wait_event_interruptible(rcu_ctrlblk.sched_wq,
- rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
- ret);
-
- /*
- * Signals would prevent us from sleeping, and we cannot
- * do much with them in any case. So flush them.
- */
- if (ret)
- flush_signals(current);
- couldsleepnext = 0;
-
- } while (!kthread_should_stop());
-
- return (0);
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. Assumes that notifiers would take care of handling any
- * outstanding requests from the RCU core.
- *
- * This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
- return (rdp->donelist != NULL ||
- !!rdp->waitlistcount ||
- rdp->nextlist != NULL ||
- rdp->nextschedlist != NULL ||
- rdp->waitschedlist != NULL);
-}
-
-int rcu_pending(int cpu)
-{
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
- /* The CPU has at least one callback queued somewhere. */
-
- if (rdp->donelist != NULL ||
- !!rdp->waitlistcount ||
- rdp->nextlist != NULL ||
- rdp->nextschedlist != NULL ||
- rdp->waitschedlist != NULL)
- return 1;
-
- /* The RCU core needs an acknowledgement from this CPU. */
-
- if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
- (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
- return 1;
-
- /* This CPU has fallen behind the global grace-period number. */
-
- if (rdp->completed != rcu_ctrlblk.completed)
- return 1;
-
- /* Nothing needed from this CPU. */
-
- return 0;
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_online_cpu(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- rcu_offline_cpu(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
-
-void __init __rcu_init(void)
-{
- int cpu;
- int i;
- struct rcu_data *rdp;
-
- printk(KERN_NOTICE "Preemptible RCU implementation.\n");
- for_each_possible_cpu(cpu) {
- rdp = RCU_DATA_CPU(cpu);
- spin_lock_init(&rdp->lock);
- rdp->completed = 0;
- rdp->waitlistcount = 0;
- rdp->nextlist = NULL;
- rdp->nexttail = &rdp->nextlist;
- for (i = 0; i < GP_STAGES; i++) {
- rdp->waitlist[i] = NULL;
- rdp->waittail[i] = &rdp->waitlist[i];
- }
- rdp->donelist = NULL;
- rdp->donetail = &rdp->donelist;
- rdp->rcu_flipctr[0] = 0;
- rdp->rcu_flipctr[1] = 0;
- rdp->nextschedlist = NULL;
- rdp->nextschedtail = &rdp->nextschedlist;
- rdp->waitschedlist = NULL;
- rdp->waitschedtail = &rdp->waitschedlist;
- rdp->rcu_sched_sleeping = 0;
- }
- register_cpu_notifier(&rcu_nb);
-
- /*
- * We don't need protection against CPU-Hotplug here
- * since
- * a) If a CPU comes online while we are iterating over the
- * cpu_online_mask below, we would only end up making a
- * duplicate call to rcu_online_cpu() which sets the corresponding
- * CPU's mask in the rcu_cpu_online_map.
- *
- * b) A CPU cannot go offline at this point in time since the user
- * does not have access to the sysfs interface, nor do we
- * suspend the system.
- */
- for_each_online_cpu(cpu)
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
-
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-}
-
-/*
- * Late-boot-time RCU initialization that must wait until after scheduler
- * has been initialized.
- */
-void __init rcu_init_sched(void)
-{
- rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
- NULL,
- "rcu_sched_grace_period");
- WARN_ON(IS_ERR(rcu_sched_grace_period_task));
-}
-
-#ifdef CONFIG_RCU_TRACE
-long *rcupreempt_flipctr(int cpu)
-{
- return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
-}
-EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
-
-int rcupreempt_flip_flag(int cpu)
-{
- return per_cpu(rcu_flip_flag, cpu);
-}
-EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
-
-int rcupreempt_mb_flag(int cpu)
-{
- return per_cpu(rcu_mb_flag, cpu);
-}
-EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
-
-char *rcupreempt_try_flip_state_name(void)
-{
- return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
-}
-EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
-
-struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
-{
- struct rcu_data *rdp = RCU_DATA_CPU(cpu);
-
- return &rdp->trace;
-}
-EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
-
-#endif /* #ifdef RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac17..00000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Read-Copy Update tracing for realtime implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2006
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <asm/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/rcupreempt_trace.h>
-#include <linux/debugfs.h>
-
-static struct mutex rcupreempt_trace_mutex;
-static char *rcupreempt_trace_buf;
-#define RCUPREEMPT_TRACE_BUF_SIZE 4096
-
-void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
-{
- trace->done_length += trace->wait_length;
- trace->done_add += trace->wait_length;
- trace->wait_length = 0;
-}
-void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
-{
- trace->wait_length += trace->next_length;
- trace->wait_add += trace->next_length;
- trace->next_length = 0;
-}
-void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
-{
- atomic_inc(&trace->rcu_try_flip_1);
-}
-void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
-{
- atomic_inc(&trace->rcu_try_flip_e1);
-}
-void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_i1++;
-}
-void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_ie1++;
-}
-void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_g1++;
-}
-void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_a1++;
-}
-void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_ae1++;
-}
-void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_a2++;
-}
-void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_z1++;
-}
-void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_ze1++;
-}
-void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_z2++;
-}
-void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_m1++;
-}
-void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_me1++;
-}
-void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
-{
- trace->rcu_try_flip_m2++;
-}
-void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
-{
- trace->rcu_check_callbacks++;
-}
-void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
-{
- trace->done_remove += trace->done_length;
- trace->done_length = 0;
-}
-void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
-{
- atomic_inc(&trace->done_invoked);
-}
-void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
-{
- trace->next_add++;
- trace->next_length++;
-}
-
-static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
-{
- struct rcupreempt_trace *cp;
- int cpu;
-
- memset(sp, 0, sizeof(*sp));
- for_each_possible_cpu(cpu) {
- cp = rcupreempt_trace_cpu(cpu);
- sp->next_length += cp->next_length;
- sp->next_add += cp->next_add;
- sp->wait_length += cp->wait_length;
- sp->wait_add += cp->wait_add;
- sp->done_length += cp->done_length;
- sp->done_add += cp->done_add;
- sp->done_remove += cp->done_remove;
- atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
- sp->rcu_check_callbacks += cp->rcu_check_callbacks;
- atomic_add(atomic_read(&cp->rcu_try_flip_1),
- &sp->rcu_try_flip_1);
- atomic_add(atomic_read(&cp->rcu_try_flip_e1),
- &sp->rcu_try_flip_e1);
- sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
- sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
- sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
- sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
- sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
- sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
- sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
- sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
- sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
- sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
- sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
- sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
- }
-}
-
-static ssize_t rcustats_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct rcupreempt_trace trace;
- ssize_t bcount;
- int cnt = 0;
-
- rcupreempt_trace_sum(&trace);
- mutex_lock(&rcupreempt_trace_mutex);
- snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "ggp=%ld rcc=%ld\n",
- rcu_batches_completed(),
- trace.rcu_check_callbacks);
- snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
- "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
- "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
-
- trace.next_add, trace.next_length,
- trace.wait_add, trace.wait_length,
- trace.done_add, trace.done_length,
- trace.done_remove, atomic_read(&trace.done_invoked),
- atomic_read(&trace.rcu_try_flip_1),
- atomic_read(&trace.rcu_try_flip_e1),
- trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
- trace.rcu_try_flip_g1,
- trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
- trace.rcu_try_flip_a2,
- trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
- trace.rcu_try_flip_z2,
- trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
- trace.rcu_try_flip_m2);
- bcount = simple_read_from_buffer(buffer, count, ppos,
- rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
- mutex_unlock(&rcupreempt_trace_mutex);
- return bcount;
-}
-
-static ssize_t rcugp_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- long oldgp = rcu_batches_completed();
- ssize_t bcount;
-
- mutex_lock(&rcupreempt_trace_mutex);
- synchronize_rcu();
- snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
- "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
- bcount = simple_read_from_buffer(buffer, count, ppos,
- rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
- mutex_unlock(&rcupreempt_trace_mutex);
- return bcount;
-}
-
-static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
- size_t count, loff_t *ppos)
-{
- int cnt = 0;
- int cpu;
- int f = rcu_batches_completed() & 0x1;
- ssize_t bcount;
-
- mutex_lock(&rcupreempt_trace_mutex);
-
- cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
- "CPU last cur F M\n");
- for_each_online_cpu(cpu) {
- long *flipctr = rcupreempt_flipctr(cpu);
- cnt += snprintf(&rcupreempt_trace_buf[cnt],
- RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "%3d %4ld %3ld %d %d\n",
- cpu,
- flipctr[!f],
- flipctr[f],
- rcupreempt_flip_flag(cpu),
- rcupreempt_mb_flag(cpu));
- }
- cnt += snprintf(&rcupreempt_trace_buf[cnt],
- RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "ggp = %ld, state = %s\n",
- rcu_batches_completed(),
- rcupreempt_try_flip_state_name());
- cnt += snprintf(&rcupreempt_trace_buf[cnt],
- RCUPREEMPT_TRACE_BUF_SIZE - cnt,
- "\n");
- bcount = simple_read_from_buffer(buffer, count, ppos,
- rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
- mutex_unlock(&rcupreempt_trace_mutex);
- return bcount;
-}
-
-static struct file_operations rcustats_fops = {
- .owner = THIS_MODULE,
- .read = rcustats_read,
-};
-
-static struct file_operations rcugp_fops = {
- .owner = THIS_MODULE,
- .read = rcugp_read,
-};
-
-static struct file_operations rcuctrs_fops = {
- .owner = THIS_MODULE,
- .read = rcuctrs_read,
-};
-
-static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
-static int rcupreempt_debugfs_init(void)
-{
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto out;
- statdir = debugfs_create_file("rcustats", 0444, rcudir,
- NULL, &rcustats_fops);
- if (!statdir)
- goto free_out;
-
- gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
- if (!gpdir)
- goto free_out;
-
- ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
- NULL, &rcuctrs_fops);
- if (!ctrsdir)
- goto free_out;
- return 0;
-free_out:
- if (statdir)
- debugfs_remove(statdir);
- if (gpdir)
- debugfs_remove(gpdir);
- debugfs_remove(rcudir);
-out:
- return 1;
-}
-
-static int __init rcupreempt_trace_init(void)
-{
- int ret;
-
- mutex_init(&rcupreempt_trace_mutex);
- rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
- if (!rcupreempt_trace_buf)
- return 1;
- ret = rcupreempt_debugfs_init();
- if (ret)
- kfree(rcupreempt_trace_buf);
- return ret;
-}
-
-static void __exit rcupreempt_trace_cleanup(void)
-{
- debugfs_remove(statdir);
- debugfs_remove(gpdir);
- debugfs_remove(ctrsdir);
- debugfs_remove(rcudir);
- kfree(rcupreempt_trace_buf);
-}
-
-
-module_init(rcupreempt_trace_init);
-module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0..b33db539a8a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_timers = 0;
static struct list_head rcu_torture_removed;
+static cpumask_var_t shuffle_tmp_mask;
static int stutter_pause_test = 0;
@@ -256,14 +257,14 @@ struct rcu_torture_ops {
void (*init)(void);
void (*cleanup)(void);
int (*readlock)(void);
- void (*readdelay)(struct rcu_random_state *rrsp);
+ void (*read_delay)(struct rcu_random_state *rrsp);
void (*readunlock)(int idx);
int (*completed)(void);
- void (*deferredfree)(struct rcu_torture *p);
+ void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*cb_barrier)(void);
int (*stats)(char *page);
- int irqcapable;
+ int irq_capable;
char *name;
};
static struct rcu_torture_ops *cur_ops = NULL;
@@ -319,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
rp->rtort_mbtest = 0;
rcu_torture_free(rp);
} else
- cur_ops->deferredfree(rp);
+ cur_ops->deferred_free(rp);
}
static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -328,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops rcu_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .readdelay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferredfree = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = rcu_barrier,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu"
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = rcu_barrier,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu"
};
static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -369,18 +370,18 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_torture_read_lock,
- .readdelay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = synchronize_rcu,
- .cb_barrier = NULL,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_sync"
};
/*
@@ -431,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
}
static struct rcu_torture_ops rcu_bh_ops = {
- .init = NULL,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferredfree = rcu_bh_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
- .cb_barrier = rcu_barrier_bh,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_bh"
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_bh_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = rcu_barrier_bh,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh"
};
static struct rcu_torture_ops rcu_bh_sync_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = rcu_bh_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
- .cb_barrier = NULL,
- .stats = NULL,
- .irqcapable = 1,
- .name = "rcu_bh_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = rcu_bh_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh_sync"
};
/*
@@ -529,17 +530,17 @@ static int srcu_torture_stats(char *page)
}
static struct rcu_torture_ops srcu_ops = {
- .init = srcu_torture_init,
- .cleanup = srcu_torture_cleanup,
- .readlock = srcu_torture_read_lock,
- .readdelay = srcu_read_delay,
- .readunlock = srcu_torture_read_unlock,
- .completed = srcu_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = srcu_torture_synchronize,
- .cb_barrier = NULL,
- .stats = srcu_torture_stats,
- .name = "srcu"
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
};
/*
@@ -573,32 +574,49 @@ static void sched_torture_synchronize(void)
}
static struct rcu_torture_ops sched_ops = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = sched_torture_completed,
- .deferredfree = rcu_sched_torture_deferred_free,
- .sync = sched_torture_synchronize,
- .cb_barrier = rcu_barrier_sched,
- .stats = NULL,
- .irqcapable = 1,
- .name = "sched"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sched_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = rcu_barrier_sched,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "sched"
};
static struct rcu_torture_ops sched_ops_sync = {
- .init = rcu_sync_torture_init,
- .cleanup = NULL,
- .readlock = sched_torture_read_lock,
- .readdelay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .completed = sched_torture_completed,
- .deferredfree = rcu_sync_torture_deferred_free,
- .sync = sched_torture_synchronize,
- .cb_barrier = NULL,
- .stats = NULL,
- .name = "sched_sync"
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = sched_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = NULL,
+ .name = "sched_sync"
+};
+
+extern int rcu_expedited_torture_stats(char *page);
+
+static struct rcu_torture_ops sched_expedited_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .completed = sched_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = synchronize_sched_expedited,
+ .cb_barrier = NULL,
+ .stats = rcu_expedited_torture_stats,
+ .irq_capable = 1,
+ .name = "sched_expedited"
};
/*
@@ -634,7 +652,7 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- cur_ops->deferredfree(old_rp);
+ cur_ops->deferred_free(old_rp);
}
rcu_torture_current_version++;
oldbatch = cur_ops->completed();
@@ -699,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
spin_lock(&rand_lock);
- cur_ops->readdelay(&rand);
+ cur_ops->read_delay(&rand);
n_rcu_torture_timers++;
spin_unlock(&rand_lock);
preempt_disable();
@@ -737,11 +755,11 @@ rcu_torture_reader(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
- if (irqreader && cur_ops->irqcapable)
+ if (irqreader && cur_ops->irq_capable)
setup_timer_on_stack(&t, rcu_torture_timer, 0);
do {
- if (irqreader && cur_ops->irqcapable) {
+ if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, 1);
}
@@ -756,7 +774,7 @@ rcu_torture_reader(void *arg)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- cur_ops->readdelay(&rand);
+ cur_ops->read_delay(&rand);
preempt_disable();
pipe_count = p->rtort_pipe_count;
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -777,7 +795,7 @@ rcu_torture_reader(void *arg)
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
rcutorture_shutdown_absorb("rcu_torture_reader");
- if (irqreader && cur_ops->irqcapable)
+ if (irqreader && cur_ops->irq_capable)
del_timer_sync(&t);
while (!kthread_should_stop())
schedule_timeout_uninterruptible(1);
@@ -889,10 +907,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
*/
static void rcu_torture_shuffle_tasks(void)
{
- cpumask_t tmp_mask;
int i;
- cpus_setall(tmp_mask);
+ cpumask_setall(shuffle_tmp_mask);
get_online_cpus();
/* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +919,29 @@ static void rcu_torture_shuffle_tasks(void)
}
if (rcu_idle_cpu != -1)
- cpu_clear(rcu_idle_cpu, tmp_mask);
+ cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
- set_cpus_allowed_ptr(current, &tmp_mask);
+ set_cpus_allowed_ptr(current, shuffle_tmp_mask);
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
if (reader_tasks[i])
set_cpus_allowed_ptr(reader_tasks[i],
- &tmp_mask);
+ shuffle_tmp_mask);
}
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed_ptr(fakewriter_tasks[i],
- &tmp_mask);
+ shuffle_tmp_mask);
}
if (writer_task)
- set_cpus_allowed_ptr(writer_task, &tmp_mask);
+ set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
if (stats_task)
- set_cpus_allowed_ptr(stats_task, &tmp_mask);
+ set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
if (rcu_idle_cpu == -1)
rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1029,7 @@ rcu_torture_cleanup(void)
if (shuffler_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
kthread_stop(shuffler_task);
+ free_cpumask_var(shuffle_tmp_mask);
}
shuffler_task = NULL;
@@ -1077,6 +1095,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &sched_expedited_ops,
&srcu_ops, &sched_ops, &sched_ops_sync, };
mutex_lock(&fullstop_mutex);
@@ -1190,10 +1209,18 @@ rcu_torture_init(void)
}
if (test_no_idle_hz) {
rcu_idle_cpu = num_online_cpus() - 1;
+
+ if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+ firsterr = -ENOMEM;
+ VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
+ goto unwind;
+ }
+
/* Create the shuffler thread */
shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
"rcu_torture_shuffle");
if (IS_ERR(shuffler_task)) {
+ free_cpumask_var(shuffle_tmp_mask);
firsterr = PTR_ERR(shuffler_task);
VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
shuffler_task = NULL;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c6..6b11b07cfe7 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
@@ -46,6 +47,8 @@
#include <linux/mutex.h>
#include <linux/time.h>
+#include "rcutree.h"
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
@@ -72,12 +75,61 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
.n_force_qs_ngp = 0, \
}
-struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_data);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+extern long rcu_batches_completed_sched(void);
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
+static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags);
+static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
+#ifdef CONFIG_HOTPLUG_CPU
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+static void __rcu_process_callbacks(struct rcu_state *rsp,
+ struct rcu_data *rdp);
+static void __call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu),
+ struct rcu_state *rsp);
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
+static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
+ int preemptable);
+
+#include "rcutree_plugin.h"
+
+/*
+ * Note a quiescent state. Because we do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period, this just sets a flag.
+ */
+void rcu_sched_qs(int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ local_irq_save(flags);
+ rdp = &per_cpu(rcu_sched_data, cpu);
+ rdp->passed_quiesc = 1;
+ rdp->passed_quiesc_completed = rdp->completed;
+ rcu_preempt_qs(cpu);
+ local_irq_restore(flags);
+}
+
+void rcu_bh_qs(int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ local_irq_save(flags);
+ rdp = &per_cpu(rcu_bh_data, cpu);
+ rdp->passed_quiesc = 1;
+ rdp->passed_quiesc_completed = rdp->completed;
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
@@ -90,15 +142,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
static int qlowmark = 100; /* Once only this many pending, use blimit. */
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+static int rcu_pending(int cpu);
/*
- * Return the number of RCU batches processed thus far for debug & stats.
+ * Return the number of RCU-sched batches processed thus far for debug & stats.
*/
-long rcu_batches_completed(void)
+long rcu_batches_completed_sched(void)
{
- return rcu_state.completed;
+ return rcu_sched_state.completed;
}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
/*
* Return the number of RCU BH batches processed thus far for debug & stats.
@@ -161,6 +214,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
return 1;
}
+ /* If preemptable RCU, no point in sending reschedule IPI. */
+ if (rdp->preemptable)
+ return 0;
+
/* The CPU is online, so send it a reschedule IPI. */
if (rdp->cpu != smp_processor_id())
smp_send_reschedule(rdp->cpu);
@@ -173,7 +230,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
#endif /* #ifdef CONFIG_SMP */
#ifdef CONFIG_NO_HZ
-static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
/**
* rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -193,7 +249,7 @@ void rcu_enter_nohz(void)
rdtp = &__get_cpu_var(rcu_dynticks);
rdtp->dynticks++;
rdtp->dynticks_nesting--;
- WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks & 0x1);
local_irq_restore(flags);
}
@@ -212,7 +268,7 @@ void rcu_exit_nohz(void)
rdtp = &__get_cpu_var(rcu_dynticks);
rdtp->dynticks++;
rdtp->dynticks_nesting++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
local_irq_restore(flags);
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -231,7 +287,7 @@ void rcu_nmi_enter(void)
if (rdtp->dynticks & 0x1)
return;
rdtp->dynticks_nmi++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -250,7 +306,7 @@ void rcu_nmi_exit(void)
return;
smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
rdtp->dynticks_nmi++;
- WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
}
/**
@@ -266,7 +322,7 @@ void rcu_irq_enter(void)
if (rdtp->dynticks_nesting++)
return;
rdtp->dynticks++;
- WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+ WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
@@ -285,10 +341,10 @@ void rcu_irq_exit(void)
return;
smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
rdtp->dynticks++;
- WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+ WARN_ON_ONCE(rdtp->dynticks & 0x1);
/* If the interrupt queued a callback, get out of dyntick mode. */
- if (__get_cpu_var(rcu_data).nxtlist ||
+ if (__get_cpu_var(rcu_sched_data).nxtlist ||
__get_cpu_var(rcu_bh_data).nxtlist)
set_need_resched();
}
@@ -441,6 +497,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
printk(KERN_ERR "INFO: RCU detected CPU stalls:");
for (; rnp_cur < rnp_end; rnp_cur++) {
+ rcu_print_task_stall(rnp);
if (rnp_cur->qsmask == 0)
continue;
for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -449,6 +506,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
}
printk(" (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start));
+ trigger_all_cpu_backtrace();
+
force_quiescent_state(rsp, 0); /* Kick them all. */
}
@@ -459,12 +518,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
smp_processor_id(), jiffies - rsp->gp_start);
- dump_stack();
+ trigger_all_cpu_backtrace();
+
spin_lock_irqsave(&rnp->lock, flags);
if ((long)(jiffies - rsp->jiffies_stall) >= 0)
rsp->jiffies_stall =
jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
spin_unlock_irqrestore(&rnp->lock, flags);
+
set_need_resched(); /* kick ourselves to get things going. */
}
@@ -510,8 +571,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->gpnum = rsp->gpnum;
- rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
- RCU_JIFFIES_TILL_FORCE_QS;
}
/*
@@ -558,8 +617,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rsp->gpnum++;
rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
- rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
- RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
dyntick_record_completed(rsp, rsp->completed - 1);
note_new_gpnum(rsp, rdp);
@@ -658,6 +715,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
+ * Clean up after the prior grace period and let rcu_start_gp() start up
+ * the next grace period if one is needed. Note that the caller must
+ * hold rnp->lock, as required by rcu_start_gp(), which will release it.
+ */
+static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ rsp->completed = rsp->gpnum;
+ rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+ rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+}
+
+/*
* Similar to cpu_quiet(), for which it is a helper function. Allows
* a group of CPUs to be quieted at one go, though all the CPUs in the
* group must be represented by the same leaf rcu_node structure.
@@ -678,7 +748,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
return;
}
rnp->qsmask &= ~mask;
- if (rnp->qsmask != 0) {
+ if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
/* Other bits still set at this level, so done. */
spin_unlock_irqrestore(&rnp->lock, flags);
@@ -698,14 +768,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Get here if we are the last CPU to pass through a quiescent
- * state for this grace period. Clean up and let rcu_start_gp()
- * start up the next grace period if one is needed. Note that
- * we still hold rnp->lock, as required by rcu_start_gp(), which
- * will release it.
+ * state for this grace period. Invoke cpu_quiet_msk_finish()
+ * to clean up and start the next grace period if one is needed.
*/
- rsp->completed = rsp->gpnum;
- rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
- rcu_start_gp(rsp, flags); /* releases rnp->lock. */
+ cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
}
/*
@@ -812,11 +878,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
spin_lock(&rnp->lock); /* irqs already disabled. */
rnp->qsmaskinit &= ~mask;
if (rnp->qsmaskinit != 0) {
- spin_unlock(&rnp->lock); /* irqs already disabled. */
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
break;
}
+ rcu_preempt_offline_tasks(rsp, rnp);
mask = rnp->grpmask;
- spin_unlock(&rnp->lock); /* irqs already disabled. */
+ spin_unlock(&rnp->lock); /* irqs remain disabled. */
rnp = rnp->parent;
} while (rnp != NULL);
lastcomp = rsp->completed;
@@ -829,7 +896,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
/*
* Move callbacks from the outgoing CPU to the running CPU.
* Note that the outgoing CPU is now quiscent, so it is now
- * (uncharacteristically) safe to access it rcu_data structure.
+ * (uncharacteristically) safe to access its rcu_data structure.
* Note also that we must carefully retain the order of the
* outgoing CPU's callbacks in order for rcu_barrier() to work
* correctly. Finally, note that we start all the callbacks
@@ -860,8 +927,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
*/
static void rcu_offline_cpu(int cpu)
{
- __rcu_offline_cpu(cpu, &rcu_state);
+ __rcu_offline_cpu(cpu, &rcu_sched_state);
__rcu_offline_cpu(cpu, &rcu_bh_state);
+ rcu_preempt_offline_cpu(cpu);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -947,25 +1015,26 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ if (!rcu_pending(cpu))
+ return; /* if nothing for RCU to do. */
if (user ||
- (idle_cpu(cpu) && !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ (idle_cpu(cpu) && rcu_scheduler_active &&
+ !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
/*
* Get here if this CPU took its interrupt from user
* mode or from the idle loop, and if this is not a
* nested interrupt. In this case, the CPU is in
- * a quiescent state, so count it.
+ * a quiescent state, so note it.
*
* No memory barrier is required here because both
- * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
- * only CPU-local variables that other CPUs neither
- * access nor modify, at least not while the corresponding
- * CPU is online.
+ * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
+ * variables that other CPUs neither access nor modify,
+ * at least not while the corresponding CPU is online.
*/
- rcu_qsctr_inc(cpu);
- rcu_bh_qsctr_inc(cpu);
+ rcu_sched_qs(cpu);
+ rcu_bh_qs(cpu);
} else if (!in_softirq()) {
@@ -973,11 +1042,12 @@ void rcu_check_callbacks(int cpu, int user)
* Get here if this CPU did not take its interrupt from
* softirq, in other words, if it is not interrupting
* a rcu_bh read-side critical section. This is an _bh
- * critical section, so count it.
+ * critical section, so note it.
*/
- rcu_bh_qsctr_inc(cpu);
+ rcu_bh_qs(cpu);
}
+ rcu_preempt_check_callbacks(cpu);
raise_softirq(RCU_SOFTIRQ);
}
@@ -1035,7 +1105,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
{
unsigned long flags;
long lastcomp;
- struct rcu_data *rdp = rsp->rda[smp_processor_id()];
struct rcu_node *rnp = rcu_get_root(rsp);
u8 signaled;
@@ -1046,16 +1115,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
return; /* Someone else is already on the job. */
}
if (relaxed &&
- (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
- (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
+ (long)(rsp->jiffies_force_qs - jiffies) >= 0)
goto unlock_ret; /* no emergency and done recently. */
rsp->n_force_qs++;
spin_lock(&rnp->lock);
lastcomp = rsp->completed;
signaled = rsp->signaled;
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
- rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
- RCU_JIFFIES_TILL_FORCE_QS;
if (lastcomp == rsp->gpnum) {
rsp->n_force_qs_ngp++;
spin_unlock(&rnp->lock);
@@ -1120,12 +1186,13 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
+ WARN_ON_ONCE(rdp->beenonline == 0);
+
/*
* If an RCU GP has gone long enough, go check for dyntick
* idle CPUs and, if needed, send resched IPIs.
*/
- if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
- (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+ if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
force_quiescent_state(rsp, 1);
/*
@@ -1159,8 +1226,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
smp_mb(); /* See above block comment. */
- __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+ __rcu_process_callbacks(&rcu_sched_state,
+ &__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+ rcu_preempt_process_callbacks();
/*
* Memory references from any later RCU read-side critical sections
@@ -1210,20 +1279,19 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = LONG_MAX;
force_quiescent_state(rsp, 0);
- } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
- (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+ } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
force_quiescent_state(rsp, 1);
local_irq_restore(flags);
}
/*
- * Queue an RCU callback for invocation after a grace period.
+ * Queue an RCU-sched callback for invocation after a grace period.
*/
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
- __call_rcu(head, func, &rcu_state);
+ __call_rcu(head, func, &rcu_sched_state);
}
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
/*
* Queue an RCU for invocation after a quicker grace period.
@@ -1249,32 +1317,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
check_cpu_stall(rsp, rdp);
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->qs_pending)
+ if (rdp->qs_pending) {
+ rdp->n_rp_qs_pending++;
return 1;
+ }
/* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
+ if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ rdp->n_rp_cb_ready++;
return 1;
+ }
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp))
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ rdp->n_rp_cpu_needs_gp++;
return 1;
+ }
/* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+ if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+ rdp->n_rp_gp_completed++;
return 1;
+ }
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+ if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+ rdp->n_rp_gp_started++;
return 1;
+ }
/* Has an RCU GP gone long enough to send resched IPIs &c? */
if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
- ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
- (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
+ ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+ rdp->n_rp_need_fqs++;
return 1;
+ }
/* nothing to do */
+ rdp->n_rp_need_nothing++;
return 0;
}
@@ -1283,10 +1363,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
-int rcu_pending(int cpu)
+static int rcu_pending(int cpu)
{
- return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
- __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+ return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
+ __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
+ rcu_preempt_pending(cpu);
}
/*
@@ -1298,27 +1379,46 @@ int rcu_pending(int cpu)
int rcu_needs_cpu(int cpu)
{
/* RCU callbacks either ready or pending? */
- return per_cpu(rcu_data, cpu).nxtlist ||
- per_cpu(rcu_bh_data, cpu).nxtlist;
+ return per_cpu(rcu_sched_data, cpu).nxtlist ||
+ per_cpu(rcu_bh_data, cpu).nxtlist ||
+ rcu_preempt_needs_cpu(cpu);
}
/*
- * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
- * approach so that we don't have to worry about how long the CPU has
- * been gone, or whether it ever was online previously. We do trust the
- * ->mynode field, as it is constant for a given struct rcu_data and
- * initialized during early boot.
- *
- * Note that only one online or offline event can be happening at a given
- * time. Note also that we can accept some slop in the rsp->completed
- * access due to the fact that this CPU cannot possibly have any RCU
- * callbacks in flight yet.
+ * Do boot-time initialization of a CPU's per-CPU RCU data.
*/
-static void
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+static void __init
+rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
int i;
+ struct rcu_data *rdp = rsp->rda[cpu];
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ spin_lock_irqsave(&rnp->lock, flags);
+ rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+ rdp->qlen = 0;
+#ifdef CONFIG_NO_HZ
+ rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+ rdp->cpu = cpu;
+ spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data. Note that only one online or
+ * offline event can be happening at a given time. Note also that we
+ * can accept some slop in the rsp->completed access due to the fact
+ * that this CPU cannot possibly have any RCU callbacks in flight yet.
+ */
+static void __cpuinit
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
+{
+ unsigned long flags;
long lastcomp;
unsigned long mask;
struct rcu_data *rdp = rsp->rda[cpu];
@@ -1332,17 +1432,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->passed_quiesc = 0; /* We could be racing with new GP, */
rdp->qs_pending = 1; /* so set up to respond to current GP. */
rdp->beenonline = 1; /* We have now been online. */
+ rdp->preemptable = preemptable;
rdp->passed_quiesc_completed = lastcomp - 1;
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
- rdp->qlen = 0;
rdp->blimit = blimit;
-#ifdef CONFIG_NO_HZ
- rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
- rdp->cpu = cpu;
spin_unlock(&rnp->lock); /* irqs remain disabled. */
/*
@@ -1383,16 +1475,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
static void __cpuinit rcu_online_cpu(int cpu)
{
- rcu_init_percpu_data(cpu, &rcu_state);
- rcu_init_percpu_data(cpu, &rcu_bh_state);
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
+ rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
+ rcu_preempt_init_percpu_data(cpu);
}
/*
- * Handle CPU online/offline notifcation events.
+ * Handle CPU online/offline notification events.
*/
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1464,6 +1556,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
rnp = rsp->level[i];
for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
spin_lock_init(&rnp->lock);
+ rnp->gpnum = 0;
rnp->qsmask = 0;
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
@@ -1481,16 +1574,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
j / rsp->levelspread[i - 1];
}
rnp->level = i;
+ INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
+ INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
}
}
}
/*
- * Helper macro for __rcu_init(). To be used nowhere else!
- * Assigns leaf node pointers into each CPU's rcu_data structure.
+ * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
+ * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
+ * structure.
*/
-#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+#define RCU_INIT_FLAVOR(rsp, rcu_data) \
do { \
+ rcu_init_one(rsp); \
rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
j = 0; \
for_each_possible_cpu(i) { \
@@ -1498,33 +1595,43 @@ do { \
j++; \
per_cpu(rcu_data, i).mynode = &rnp[j]; \
(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+ rcu_boot_init_percpu_data(i, rsp); \
} \
} while (0)
-static struct notifier_block __cpuinitdata rcu_nb = {
- .notifier_call = rcu_cpu_notify,
-};
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+void __init __rcu_init_preempt(void)
+{
+ int i; /* All used by RCU_INIT_FLAVOR(). */
+ int j;
+ struct rcu_node *rnp;
+
+ RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+}
+
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+void __init __rcu_init_preempt(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
void __init __rcu_init(void)
{
- int i; /* All used by RCU_DATA_PTR_INIT(). */
+ int i; /* All used by RCU_INIT_FLAVOR(). */
int j;
struct rcu_node *rnp;
- printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+ rcu_bootup_announce();
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
- rcu_init_one(&rcu_state);
- RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
- rcu_init_one(&rcu_bh_state);
- RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
-
- for_each_online_cpu(i)
- rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
- /* Register notifier for non-boot CPUs */
- register_cpu_notifier(&rcu_nb);
- printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
+ RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+ __rcu_init_preempt();
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
}
module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 00000000000..bf8a6f9f134
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,259 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT
+# define NUM_RCU_LVLS 1
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 (NR_CPUS)
+# define NUM_RCU_LVL_2 0
+# define NUM_RCU_LVL_3 0
+#elif NR_CPUS <= RCU_FANOUT_SQ
+# define NUM_RCU_LVLS 2
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+# define NUM_RCU_LVL_2 (NR_CPUS)
+# define NUM_RCU_LVL_3 0
+#elif NR_CPUS <= RCU_FANOUT_CUBE
+# define NUM_RCU_LVLS 3
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+# define NUM_RCU_LVL_3 NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+ int dynticks_nesting; /* Track nesting level, sort of. */
+ int dynticks; /* Even value for dynticks-idle, else odd. */
+ int dynticks_nmi; /* Even value for either dynticks-idle or */
+ /* not in nmi handler, else odd. So this */
+ /* remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+ spinlock_t lock;
+ long gpnum; /* Current grace period for this node. */
+ /* This will either be equal to or one */
+ /* behind the root rcu_node's gpnum. */
+ unsigned long qsmask; /* CPUs or groups that need to switch in */
+ /* order for current grace period to proceed.*/
+ unsigned long qsmaskinit;
+ /* Per-GP initialization for qsmask. */
+ unsigned long grpmask; /* Mask to apply to parent qsmask. */
+ int grplo; /* lowest-numbered CPU or group here. */
+ int grphi; /* highest-numbered CPU or group here. */
+ u8 grpnum; /* CPU/group number for next level up. */
+ u8 level; /* root is at level 0. */
+ struct rcu_node *parent;
+ struct list_head blocked_tasks[2];
+ /* Tasks blocked in RCU read-side critsect. */
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL 3
+#define RCU_NEXT_SIZE 4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+ /* 1) quiescent-state and grace-period handling : */
+ long completed; /* Track rsp->completed gp number */
+ /* in order to detect GP end. */
+ long gpnum; /* Highest gp number that this CPU */
+ /* is aware of having started. */
+ long passed_quiesc_completed;
+ /* Value of completed at time of qs. */
+ bool passed_quiesc; /* User-mode/idle loop etc. */
+ bool qs_pending; /* Core waits for quiesc state. */
+ bool beenonline; /* CPU online at least once. */
+ bool preemptable; /* Preemptable RCU? */
+ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
+ unsigned long grpmask; /* Mask to apply to leaf qsmask. */
+
+ /* 2) batch handling */
+ /*
+ * If nxtlist is not NULL, it is partitioned as follows.
+ * Any of the partitions might be empty, in which case the
+ * pointer to that partition will be equal to the pointer for
+ * the following partition. When the list is empty, all of
+ * the nxttail elements point to nxtlist, which is NULL.
+ *
+ * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+ * Entries that might have arrived after current GP ended
+ * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+ * Entries known to have arrived before current GP ended
+ * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+ * Entries that batch # <= ->completed - 1: waiting for current GP
+ * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+ * Entries that batch # <= ->completed
+ * The grace period for these entries has completed, and
+ * the other grace-period-completed entries may be moved
+ * here temporarily in rcu_process_callbacks().
+ */
+ struct rcu_head *nxtlist;
+ struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ long qlen; /* # of queued callbacks */
+ long blimit; /* Upper limit on a processed batch */
+
+#ifdef CONFIG_NO_HZ
+ /* 3) dynticks interface. */
+ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
+ int dynticks_snap; /* Per-GP tracking for dynticks. */
+ int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+ unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long resched_ipi; /* Sent a resched IPI. */
+
+ /* 5) __rcu_pending() statistics. */
+ long n_rcu_pending; /* rcu_pending() calls since boot. */
+ long n_rp_qs_pending;
+ long n_rp_cb_ready;
+ long n_rp_cpu_needs_gp;
+ long n_rp_gp_completed;
+ long n_rp_gp_started;
+ long n_rp_need_fqs;
+ long n_rp_need_nothing;
+
+ int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_GP_INIT 0 /* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
+#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
+ /* to take at least one */
+ /* scheduling clock irq */
+ /* before ratting on them. */
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy. This hierarchy is
+ * represented in "heap" form in a dense array. The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]). The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+ struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
+ struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
+ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
+ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
+ struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
+
+ /* The following fields are guarded by the root rcu_node's lock. */
+
+ u8 signaled ____cacheline_internodealigned_in_smp;
+ /* Force QS state. */
+ long gpnum; /* Current gp number. */
+ long completed; /* # of last completed gp. */
+ spinlock_t onofflock; /* exclude on/offline and */
+ /* starting new GP. */
+ spinlock_t fqslock; /* Only one task forcing */
+ /* quiescent states. */
+ unsigned long jiffies_force_qs; /* Time at which to invoke */
+ /* force_quiescent_state(). */
+ unsigned long n_force_qs; /* Number of calls to */
+ /* force_quiescent_state(). */
+ unsigned long n_force_qs_lh; /* ~Number of calls leaving */
+ /* due to lock unavailable. */
+ unsigned long n_force_qs_ngp; /* Number of calls leaving */
+ /* due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ unsigned long gp_start; /* Time at which GP started, */
+ /* but in jiffies. */
+ unsigned long jiffies_stall; /* Time at which to check */
+ /* for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+ long dynticks_completed; /* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+#ifdef RCU_TREE_NONCORE
+
+/*
+ * RCU implementation internal declarations:
+ */
+extern struct rcu_state rcu_sched_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_TREE_PREEMPT_RCU
+extern struct rcu_state rcu_preempt_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+#endif /* #ifdef RCU_TREE_NONCORE */
+
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 00000000000..47789369ea5
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptable semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+
+#ifdef CONFIG_TREE_PREEMPT_RCU
+
+struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+
+/*
+ * Tell them what RCU they are running.
+ */
+static inline void rcu_bootup_announce(void)
+{
+ printk(KERN_INFO
+ "Experimental preemptable hierarchical RCU implementation.\n");
+}
+
+/*
+ * Return the number of RCU-preempt batches processed thus far
+ * for debug and statistics.
+ */
+long rcu_batches_completed_preempt(void)
+{
+ return rcu_preempt_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_batches_completed_preempt();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Record a preemptable-RCU quiescent state for the specified CPU. Note
+ * that this just means that the task currently running on the CPU is
+ * not in a quiescent state. There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ */
+static void rcu_preempt_qs_record(int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
+ rdp->passed_quiesc = 1;
+ rdp->passed_quiesc_completed = rdp->completed;
+}
+
+/*
+ * We have entered the scheduler or are between softirqs in ksoftirqd.
+ * If we are in an RCU read-side critical section, we need to reflect
+ * that in the state of the rcu_node structure corresponding to this CPU.
+ * Caller must disable hardirqs.
+ */
+static void rcu_preempt_qs(int cpu)
+{
+ struct task_struct *t = current;
+ int phase;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ if (t->rcu_read_lock_nesting &&
+ (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+ /* Possibly blocking in an RCU read-side critical section. */
+ rdp = rcu_preempt_state.rda[cpu];
+ rnp = rdp->mynode;
+ spin_lock(&rnp->lock);
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+ t->rcu_blocked_node = rnp;
+
+ /*
+ * If this CPU has already checked in, then this task
+ * will hold up the next grace period rather than the
+ * current grace period. Queue the task accordingly.
+ * If the task is queued for the current grace period
+ * (i.e., this CPU has not yet passed through a quiescent
+ * state for the current grace period), then as long
+ * as that task remains queued, the current grace period
+ * cannot end.
+ */
+ phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
+ list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
+ smp_mb(); /* Ensure later ctxt swtch seen after above. */
+ spin_unlock(&rnp->lock);
+ }
+
+ /*
+ * Either we were not in an RCU read-side critical section to
+ * begin with, or we have now recorded that critical section
+ * globally. Either way, we can now note a quiescent state
+ * for this CPU. Again, if we were in an RCU read-side critical
+ * section, and if that critical section was blocking the current
+ * grace period, then the fact that the task has been enqueued
+ * means that we continue to block the current grace period.
+ */
+ rcu_preempt_qs_record(cpu);
+ t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
+ RCU_READ_UNLOCK_GOT_QS);
+}
+
+/*
+ * Tree-preemptable RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+ barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+ int empty;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp;
+ int special;
+
+ /* NMI handlers cannot block and cannot safely manipulate state. */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * If RCU core is waiting for this CPU to exit critical section,
+ * let it know that we have done so.
+ */
+ special = t->rcu_read_unlock_special;
+ if (special & RCU_READ_UNLOCK_NEED_QS) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
+ }
+
+ /* Hardware IRQ handlers cannot block. */
+ if (in_irq()) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* Clean up if blocked during RCU read-side critical section. */
+ if (special & RCU_READ_UNLOCK_BLOCKED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+ /*
+ * Remove this task from the list it blocked on. The
+ * task can migrate while we acquire the lock, but at
+ * most one time. So at most two passes through loop.
+ */
+ for (;;) {
+ rnp = t->rcu_blocked_node;
+ spin_lock(&rnp->lock);
+ if (rnp == t->rcu_blocked_node)
+ break;
+ spin_unlock(&rnp->lock);
+ }
+ empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+ list_del_init(&t->rcu_node_entry);
+ t->rcu_blocked_node = NULL;
+
+ /*
+ * If this was the last task on the current list, and if
+ * we aren't waiting on any CPUs, report the quiescent state.
+ * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
+ * drop rnp->lock and restore irq.
+ */
+ if (!empty && rnp->qsmask == 0 &&
+ list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
+ t->rcu_read_unlock_special &=
+ ~(RCU_READ_UNLOCK_NEED_QS |
+ RCU_READ_UNLOCK_GOT_QS);
+ if (rnp->parent == NULL) {
+ /* Only one rcu_node in the tree. */
+ cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+ return;
+ }
+ /* Report up the rest of the hierarchy. */
+ mask = rnp->grpmask;
+ spin_unlock_irqrestore(&rnp->lock, flags);
+ rnp = rnp->parent;
+ spin_lock_irqsave(&rnp->lock, flags);
+ cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
+ return;
+ }
+ spin_unlock(&rnp->lock);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Tree-preemptable RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
+ if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+ unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static void rcu_print_task_stall(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct list_head *lp;
+ int phase = rnp->gpnum & 0x1;
+ struct task_struct *t;
+
+ if (!list_empty(&rnp->blocked_tasks[phase])) {
+ spin_lock_irqsave(&rnp->lock, flags);
+ phase = rnp->gpnum & 0x1; /* re-read under lock. */
+ lp = &rnp->blocked_tasks[phase];
+ list_for_each_entry(t, lp, rcu_node_entry)
+ printk(" P%d", t->pid);
+ spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Check for preempted RCU readers for the specified rcu_node structure.
+ * If the caller needs a reliable answer, it must hold the rcu_node's
+ * >lock.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+ return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Handle tasklist migration for case in which all CPUs covered by the
+ * specified rcu_node have gone offline. Move them up to the root
+ * rcu_node. The reason for not just moving them to the immediate
+ * parent is to remove the need for rcu_read_unlock_special() to
+ * make more than two attempts to acquire the target rcu_node's lock.
+ *
+ * The caller must hold rnp->lock with irqs disabled.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ int i;
+ struct list_head *lp;
+ struct list_head *lp_root;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+ struct task_struct *tp;
+
+ if (rnp == rnp_root) {
+ WARN_ONCE(1, "Last CPU thought to be offlined?");
+ return; /* Shouldn't happen: at least one CPU online. */
+ }
+
+ /*
+ * Move tasks up to root rcu_node. Rely on the fact that the
+ * root rcu_node can be at most one ahead of the rest of the
+ * rcu_nodes in terms of gp_num value. This fact allows us to
+ * move the blocked_tasks[] array directly, element by element.
+ */
+ for (i = 0; i < 2; i++) {
+ lp = &rnp->blocked_tasks[i];
+ lp_root = &rnp_root->blocked_tasks[i];
+ while (!list_empty(lp)) {
+ tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
+ spin_lock(&rnp_root->lock); /* irqs already disabled */
+ list_del(&tp->rcu_node_entry);
+ tp->rcu_blocked_node = rnp_root;
+ list_add(&tp->rcu_node_entry, lp_root);
+ spin_unlock(&rnp_root->lock); /* irqs remain disabled */
+ }
+ }
+}
+
+/*
+ * Do CPU-offline processing for preemptable RCU.
+ */
+static void rcu_preempt_offline_cpu(int cpu)
+{
+ __rcu_offline_cpu(cpu, &rcu_preempt_state);
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Check for a quiescent state from the current CPU. When a task blocks,
+ * the task is recorded in the corresponding CPU's rcu_node structure,
+ * which is checked elsewhere.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(int cpu)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0) {
+ t->rcu_read_unlock_special &=
+ ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
+ rcu_preempt_qs_record(cpu);
+ return;
+ }
+ if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
+ if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
+ rcu_preempt_qs_record(cpu);
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
+ } else if (!(t->rcu_read_unlock_special &
+ RCU_READ_UNLOCK_NEED_QS)) {
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+ }
+ }
+}
+
+/*
+ * Process callbacks for preemptable RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+ __rcu_process_callbacks(&rcu_preempt_state,
+ &__get_cpu_var(rcu_preempt_data));
+}
+
+/*
+ * Queue a preemptable-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_preempt_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Check to see if there is any immediate preemptable-RCU-related work
+ * to be done.
+ */
+static int rcu_preempt_pending(int cpu)
+{
+ return __rcu_pending(&rcu_preempt_state,
+ &per_cpu(rcu_preempt_data, cpu));
+}
+
+/*
+ * Does preemptable RCU need the CPU to stay out of dynticks mode?
+ */
+static int rcu_preempt_needs_cpu(int cpu)
+{
+ return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize preemptable RCU's per-CPU data.
+ */
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
+{
+ rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
+}
+
+/*
+ * Check for a task exiting while in a preemptable-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0)
+ return;
+ t->rcu_read_lock_nesting = 1;
+ rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+
+/*
+ * Tell them what RCU they are running.
+ */
+static inline void rcu_bootup_announce(void)
+{
+ printk(KERN_INFO "Hierarchical RCU implementation.\n");
+}
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+ return rcu_batches_completed_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Because preemptable RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_qs(int cpu)
+{
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+/*
+ * Because preemptable RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_task_stall(struct rcu_node *rnp)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Because preemptable RCU does not exist, there are never any preempted
+ * RCU readers.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Because preemptable RCU does not exist, it never needs to migrate
+ * tasks that were blocked within RCU read-side critical sections.
+ */
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+}
+
+/*
+ * Because preemptable RCU does not exist, it never needs CPU-offline
+ * processing.
+ */
+static void rcu_preempt_offline_cpu(int cpu)
+{
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Because preemptable RCU does not exist, it never has any callbacks
+ * to check.
+ */
+void rcu_preempt_check_callbacks(int cpu)
+{
+}
+
+/*
+ * Because preemptable RCU does not exist, it never has any callbacks
+ * to process.
+ */
+void rcu_preempt_process_callbacks(void)
+{
+}
+
+/*
+ * In classic RCU, call_rcu() is just call_rcu_sched().
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ call_rcu_sched(head, func);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Because preemptable RCU does not exist, it never has any work to do.
+ */
+static int rcu_preempt_pending(int cpu)
+{
+ return 0;
+}
+
+/*
+ * Because preemptable RCU does not exist, it never needs any CPU.
+ */
+static int rcu_preempt_needs_cpu(int cpu)
+{
+ return 0;
+}
+
+/*
+ * Because preemptable RCU does not exist, there is no per-CPU
+ * data to initialize.
+ */
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e83782..0ea1bff6972 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,18 +43,19 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#define RCU_TREE_NONCORE
+#include "rcutree.h"
+
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->completed, rdp->gpnum,
rdp->passed_quiesc, rdp->passed_quiesc_completed,
- rdp->qs_pending,
- rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
- (int)(rdp->n_rcu_pending & 0xffff));
+ rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, " dt=%d/%d dn=%d df=%lu",
rdp->dynticks->dynticks,
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
static int show_rcudata(struct seq_file *m, void *unused)
{
- seq_puts(m, "rcu:\n");
- PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ seq_puts(m, "rcu_preempt:\n");
+ PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_puts(m, "rcu_sched:\n");
+ PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
seq_puts(m, "rcu_bh:\n");
PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
return 0;
@@ -100,14 +105,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
- seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
+ seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
rdp->cpu,
- cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
+ cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
rdp->completed, rdp->gpnum,
rdp->passed_quiesc, rdp->passed_quiesc_completed,
- rdp->qs_pending,
- rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
- rdp->n_rcu_pending);
+ rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
rdp->dynticks->dynticks,
@@ -121,13 +124,17 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
static int show_rcudata_csv(struct seq_file *m, void *unused)
{
- seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
+ seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
#ifdef CONFIG_NO_HZ
seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
- seq_puts(m, "\"rcu:\"\n");
- PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ seq_puts(m, "\"rcu_preempt:\"\n");
+ PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_puts(m, "\"rcu_sched:\"\n");
+ PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
seq_puts(m, "\"rcu_bh:\"\n");
PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
return 0;
@@ -173,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
static int show_rcuhier(struct seq_file *m, void *unused)
{
- seq_puts(m, "rcu:\n");
- print_one_rcu_state(m, &rcu_state);
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ seq_puts(m, "rcu_preempt:\n");
+ print_one_rcu_state(m, &rcu_preempt_state);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_puts(m, "rcu_sched:\n");
+ print_one_rcu_state(m, &rcu_sched_state);
seq_puts(m, "rcu_bh:\n");
print_one_rcu_state(m, &rcu_bh_state);
return 0;
@@ -195,8 +206,12 @@ static struct file_operations rcuhier_fops = {
static int show_rcugp(struct seq_file *m, void *unused)
{
- seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
- rcu_state.completed, rcu_state.gpnum);
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
+ rcu_preempt_state.completed, rcu_preempt_state.gpnum);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
+ rcu_sched_state.completed, rcu_sched_state.gpnum);
seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
rcu_bh_state.completed, rcu_bh_state.gpnum);
return 0;
@@ -215,51 +230,102 @@ static struct file_operations rcugp_fops = {
.release = single_release,
};
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+ seq_printf(m, "%3d%cnp=%ld "
+ "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ rdp->cpu,
+ cpu_is_offline(rdp->cpu) ? '!' : ' ',
+ rdp->n_rcu_pending,
+ rdp->n_rp_qs_pending,
+ rdp->n_rp_cb_ready,
+ rdp->n_rp_cpu_needs_gp,
+ rdp->n_rp_gp_completed,
+ rdp->n_rp_gp_started,
+ rdp->n_rp_need_fqs,
+ rdp->n_rp_need_nothing);
+}
+
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+ int cpu;
+ struct rcu_data *rdp;
+
+ for_each_possible_cpu(cpu) {
+ rdp = rsp->rda[cpu];
+ if (rdp->beenonline)
+ print_one_rcu_pending(m, rdp);
+ }
+}
+
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ seq_puts(m, "rcu_preempt:\n");
+ print_rcu_pendings(m, &rcu_preempt_state);
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_puts(m, "rcu_sched:\n");
+ print_rcu_pendings(m, &rcu_sched_state);
+ seq_puts(m, "rcu_bh:\n");
+ print_rcu_pendings(m, &rcu_bh_state);
+ return 0;
+}
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcu_pending, NULL);
+}
+
+static struct file_operations rcu_pending_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_pending_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
static int __init rcuclassic_trace_init(void)
{
+ struct dentry *retval;
+
rcudir = debugfs_create_dir("rcu", NULL);
if (!rcudir)
- goto out;
+ goto free_out;
- datadir = debugfs_create_file("rcudata", 0444, rcudir,
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
NULL, &rcudata_fops);
- if (!datadir)
+ if (!retval)
goto free_out;
- datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
+ retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
NULL, &rcudata_csv_fops);
- if (!datadir_csv)
+ if (!retval)
goto free_out;
- gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
- if (!gpdir)
+ retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+ if (!retval)
goto free_out;
- hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+ retval = debugfs_create_file("rcuhier", 0444, rcudir,
NULL, &rcuhier_fops);
- if (!hierdir)
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcu_pending", 0444, rcudir,
+ NULL, &rcu_pending_fops);
+ if (!retval)
goto free_out;
return 0;
free_out:
- if (datadir)
- debugfs_remove(datadir);
- if (datadir_csv)
- debugfs_remove(datadir_csv);
- if (gpdir)
- debugfs_remove(gpdir);
- debugfs_remove(rcudir);
-out:
+ debugfs_remove_recursive(rcudir);
return 1;
}
static void __exit rcuclassic_trace_cleanup(void)
{
- debugfs_remove(datadir);
- debugfs_remove(datadir_csv);
- debugfs_remove(gpdir);
- debugfs_remove(hierdir);
- debugfs_remove(rcudir);
+ debugfs_remove_recursive(rcudir);
}
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77..bc188549788 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
- if (unlikely(chan->has_base_filename))
+ if (unlikely(chan->has_base_filename)) {
+ mutex_unlock(&relay_channels_mutex);
return -EEXIST;
+ }
chan->has_base_filename = 1;
chan->parent = parent;
curr_cpu = get_cpu();
@@ -675,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
*/
for_each_online_cpu(i) {
if (unlikely(!chan->buf[i])) {
- printk(KERN_ERR "relay_late_setup_files: CPU %u "
- "has no buffer, it must have!\n", i);
- BUG();
+ WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
@@ -748,7 +748,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
- __mod_timer(&buf->timer, jiffies + 1);
+ mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
@@ -795,13 +795,15 @@ void relay_subbufs_consumed(struct rchan *chan,
if (!chan)
return;
- if (cpu >= NR_CPUS || !chan->buf[cpu])
+ if (cpu >= NR_CPUS || !chan->buf[cpu] ||
+ subbufs_consumed > chan->n_subbufs)
return;
buf = chan->buf[cpu];
- buf->subbufs_consumed += subbufs_consumed;
- if (buf->subbufs_consumed > buf->subbufs_produced)
+ if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
+ else
+ buf->subbufs_consumed += subbufs_consumed;
}
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c80..e1338f07431 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
- counter->limit = (unsigned long long)LLONG_MAX;
+ counter->limit = RESOURCE_MAX;
counter->parent = parent;
}
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
{
char *end;
+
+ /* return RESOURCE_MAX(unlimited) if "-1" is specified */
+ if (*buf == '-') {
+ *res = simple_strtoull(buf + 1, &end, 10);
+ if (*res != 1 || *end != '\0')
+ return -EINVAL;
+ *res = RESOURCE_MAX;
+ return 0;
+ }
+
/* FIXME - make memparse() take const char* args */
*res = memparse((char *)buf, &end);
if (*end != '\0')
diff --git a/kernel/resource.c b/kernel/resource.c
index fd5d7d574bb..78b087221c1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -533,43 +533,21 @@ static void __init __reserve_region_with_split(struct resource *root,
res->end = end;
res->flags = IORESOURCE_BUSY;
- for (;;) {
- conflict = __request_resource(parent, res);
- if (!conflict)
- break;
- if (conflict != parent) {
- parent = conflict;
- if (!(conflict->flags & IORESOURCE_BUSY))
- continue;
- }
-
- /* Uhhuh, that didn't work out.. */
- kfree(res);
- res = NULL;
- break;
- }
-
- if (!res) {
- /* failed, split and try again */
-
- /* conflict covered whole area */
- if (conflict->start <= start && conflict->end >= end)
- return;
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ return;
- if (conflict->start > start)
- __reserve_region_with_split(root, start, conflict->start-1, name);
- if (!(conflict->flags & IORESOURCE_BUSY)) {
- resource_size_t common_start, common_end;
+ /* failed, split and try again */
+ kfree(res);
- common_start = max(conflict->start, start);
- common_end = min(conflict->end, end);
- if (common_start < common_end)
- __reserve_region_with_split(root, common_start, common_end, name);
- }
- if (conflict->end < end)
- __reserve_region_with_split(root, conflict->end+1, end, name);
- }
+ /* conflict covered whole area */
+ if (conflict->start <= start && conflict->end >= end)
+ return;
+ if (conflict->start > start)
+ __reserve_region_with_split(root, start, conflict->start-1, name);
+ if (conflict->end < end)
+ __reserve_region_with_split(root, conflict->end+1, end, name);
}
void __init reserve_region_with_split(struct resource *root,
@@ -809,7 +787,7 @@ static int __init reserve_setup(char *str)
static struct resource reserve[MAXRESERVE];
for (;;) {
- int io_start, io_num;
+ unsigned int io_start, io_num;
int x = reserved;
if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ff..29bd4baf9e7 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* assigned pending owner [which might not have taken the
* lock yet]:
*/
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+ struct task_struct *task)
{
struct task_struct *pendowner = rt_mutex_owner(lock);
struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
if (!rt_mutex_owner_pending(lock))
return 0;
- if (pendowner == current)
+ if (pendowner == task)
return 1;
spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (current->prio >= pendowner->prio) {
+ if (task->prio >= pendowner->prio) {
spin_unlock_irqrestore(&pendowner->pi_lock, flags);
return 0;
}
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
* We are going to steal the lock and a waiter was
* enqueued on the pending owners pi_waiters queue. So
* we have to enqueue this waiter into
- * current->pi_waiters list. This covers the case,
- * where current is boosted because it holds another
+ * task->pi_waiters list. This covers the case,
+ * where task is boosted because it holds another
* lock and gets unboosted because the booster is
* interrupted, so we would delay a waiter with higher
- * priority as current->normal_prio.
+ * priority as task->normal_prio.
*
* Note: in the rare case of a SCHED_OTHER task changing
* its priority and thus stealing the lock, next->task
- * might be current:
+ * might be task:
*/
- if (likely(next->task != current)) {
- spin_lock_irqsave(&current->pi_lock, flags);
- plist_add(&next->pi_list_entry, &current->pi_waiters);
- __rt_mutex_adjust_prio(current);
- spin_unlock_irqrestore(&current->pi_lock, flags);
+ if (likely(next->task != task)) {
+ spin_lock_irqsave(&task->pi_lock, flags);
+ plist_add(&next->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
}
return 1;
}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
mark_rt_mutex_waiters(lock);
- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
return 0;
/* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
int detect_deadlock)
{
struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
unsigned long flags;
int chain_walk = 0, res;
- spin_lock_irqsave(&current->pi_lock, flags);
- __rt_mutex_adjust_prio(current);
- waiter->task = current;
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ waiter->task = task;
waiter->lock = lock;
- plist_node_init(&waiter->list_entry, current->prio);
- plist_node_init(&waiter->pi_list_entry, current->prio);
+ plist_node_init(&waiter->list_entry, task->prio);
+ plist_node_init(&waiter->pi_list_entry, task->prio);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
top_waiter = rt_mutex_top_waiter(lock);
plist_add(&waiter->list_entry, &lock->wait_list);
- current->pi_blocked_on = waiter;
+ task->pi_blocked_on = waiter;
- spin_unlock_irqrestore(&current->pi_lock, flags);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- current);
+ task);
spin_lock(&lock->wait_lock);
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @detect_deadlock: passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
*/
static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
{
- struct rt_mutex_waiter waiter;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- waiter.task = NULL;
-
- spin_lock(&lock->wait_lock);
-
- /* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock)) {
- spin_unlock(&lock->wait_lock);
- return 0;
- }
-
- set_current_state(state);
-
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout)) {
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
-
for (;;) {
/* Try to acquire the lock: */
if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
}
/*
- * waiter.task is NULL the first time we come here and
+ * waiter->task is NULL the first time we come here and
* when we have been woken up by the previous owner
* but the lock got stolen by a higher prio task.
*/
- if (!waiter.task) {
- ret = task_blocks_on_rt_mutex(lock, &waiter,
+ if (!waiter->task) {
+ ret = task_blocks_on_rt_mutex(lock, waiter, current,
detect_deadlock);
/*
* If we got woken up by the owner then start loop
* all over without going into schedule to try
* to get the lock now:
*/
- if (unlikely(!waiter.task)) {
+ if (unlikely(!waiter->task)) {
/*
* Reset the return value. We might
* have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
spin_unlock(&lock->wait_lock);
- debug_rt_mutex_print_deadlock(&waiter);
+ debug_rt_mutex_print_deadlock(waiter);
- if (waiter.task)
+ if (waiter->task)
schedule_rt_mutex(lock);
spin_lock(&lock->wait_lock);
set_current_state(state);
}
+ return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock)) {
+ spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout)) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+ detect_deadlock);
+
set_current_state(TASK_RUNNING);
if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
*
* @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
* Returns:
* 0 on success
* -EINTR when interrupted by a signal
- * -ETIMEOUT when the timeout expired
+ * -ETIMEDOUT when the timeout expired
* -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
}
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-/***
+/**
* rt_mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
@@ -986,6 +1013,57 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ * @detect_deadlock: perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task, int detect_deadlock)
+{
+ int ret;
+
+ spin_lock(&lock->wait_lock);
+
+ mark_rt_mutex_waiters(lock);
+
+ if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+ /* We got the lock for task. */
+ debug_rt_mutex_lock(lock);
+ rt_mutex_set_owner(lock, task, 0);
+ spin_unlock(&lock->wait_lock);
+ rt_mutex_deadlock_account_lock(lock, task);
+ return 1;
+ }
+
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+ if (ret && !waiter->task) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+ spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -1004,3 +1082,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
return rt_mutex_top_waiter(lock)->task;
}
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @detect_deadlock: perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock)
+{
+ int ret;
+
+ spin_lock(&lock->wait_lock);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+ detect_deadlock);
+
+ set_current_state(TASK_RUNNING);
+
+ if (unlikely(waiter->task))
+ remove_waiter(lock, waiter);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might have been
+ * the pending owner and boosted. Since we did not take the lock, the
+ * PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800e..97a2f81866a 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a..e27a53685ed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -63,22 +64,22 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,36 +119,8 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
-#ifdef CONFIG_SMP
-
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
- */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
- return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
-
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
- sg->__cpu_power += val;
- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
-#endif
-
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -223,7 +196,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
- if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
@@ -231,13 +204,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
+ unsigned long delta;
+ ktime_t soft, hard;
+
if (hrtimer_active(&rt_b->rt_period_timer))
break;
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start_expires(&rt_b->rt_period_timer,
- HRTIMER_MODE_ABS);
+
+ soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+ hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+ HRTIMER_MODE_ABS_PINNED, 0);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -306,8 +286,8 @@ void set_tg_uid(struct user_struct *user)
/*
* Root task group.
- * Every UID task group (including init_task_group aka UID-0) will
- * be a child to this group.
+ * Every UID task group (including init_task_group aka UID-0) will
+ * be a child to this group.
*/
struct task_group root_task_group;
@@ -315,7 +295,7 @@ struct task_group root_task_group;
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -331,6 +311,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
*/
static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+ return list_empty(&root_task_group.children);
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +378,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#else
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+ return 1;
+}
+#endif
+
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
@@ -467,11 +461,18 @@ struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- int highest_prio; /* highest queued rt task prio */
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
int overloaded;
+ struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
@@ -549,7 +550,6 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned long last_tick_seen;
unsigned char in_nohz_recently;
@@ -558,6 +558,7 @@ struct rq {
struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+ u64 nr_migrations_in;
struct cfs_rq cfs;
struct rt_rq rt;
@@ -590,7 +591,9 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
+ unsigned char idle_at_tick;
/* For active balancing */
+ int post_schedule;
int active_balance;
int push_cpu;
/* cpu of this runqueue: */
@@ -601,8 +604,15 @@ struct rq {
struct task_struct *migration_thread;
struct list_head migration_queue;
+
+ u64 rt_avg;
+ u64 age_stamp;
#endif
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
@@ -618,9 +628,6 @@ struct rq {
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
- unsigned int yld_exp_empty;
- unsigned int yld_act_empty;
- unsigned int yld_both_empty;
unsigned int yld_count;
/* schedule() stats */
@@ -667,8 +674,9 @@ static inline int cpu_of(struct rq *rq)
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() (&__raw_get_cpu_var(runqueues))
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
{
rq->clock = sched_clock_cpu(cpu_of(rq));
}
@@ -835,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int sysctl_sched_shares_thresh = 4;
/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
+/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
*/
@@ -1093,7 +1109,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
@@ -1129,7 +1145,8 @@ static __init void init_hrtick(void)
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+ __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
}
static inline void init_hrtick(void)
@@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)
assert_spin_locked(&task_rq(p)->lock);
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ if (test_tsk_need_resched(p))
return;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
@@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
- set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+ set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
@@ -1251,12 +1268,37 @@ void wake_up_idle_cpu(int cpu)
}
#endif /* CONFIG_NO_HZ */
+static u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+ s64 period = sched_avg_period();
+
+ while ((s64)(rq->clock - rq->age_stamp) > period) {
+ rq->age_stamp += period;
+ rq->rt_avg /= 2;
+ }
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+ rq->rt_avg += rt_delta;
+ sched_avg_update(rq);
+}
+
#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
@@ -1393,10 +1435,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct rq_iterator *iterator);
#endif
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val) {}
#endif
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -1474,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
+struct update_shares_data {
+ unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* Calculate and set the cpu's group shares.
*/
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares,
+ unsigned long sd_rq_weight,
+ struct update_shares_data *usd)
{
- unsigned long shares;
- unsigned long rq_weight;
-
- if (!tg->se[cpu])
- return;
+ unsigned long shares, rq_weight;
+ int boost = 0;
- rq_weight = tg->cfs_rq[cpu]->rq_weight;
+ rq_weight = usd->rq_weight[cpu];
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
/*
- * \Sum shares * rq_weight
- * shares = -----------------------
- * \Sum rq_weight
- *
+ * \Sum_j shares_j * rq_weight_i
+ * shares_i = -----------------------------
+ * \Sum_j rq_weight_j
*/
shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1506,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->shares = shares;
-
+ tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
__set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1520,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long weight, rq_weight = 0;
- unsigned long shares = 0;
+ unsigned long weight, rq_weight = 0, shares = 0;
+ struct update_shares_data *usd;
struct sched_domain *sd = data;
+ unsigned long flags;
int i;
+ if (!tg->se[0])
+ return 0;
+
+ local_irq_save(flags);
+ usd = &__get_cpu_var(update_shares_data);
+
for_each_cpu(i, sched_domain_span(sd)) {
+ weight = tg->cfs_rq[i]->load.weight;
+ usd->rq_weight[i] = weight;
+
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
*/
- weight = tg->cfs_rq[i]->load.weight;
if (!weight)
weight = NICE_0_LOAD;
- tg->cfs_rq[i]->rq_weight = weight;
rq_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
@@ -1547,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
shares = tg->shares;
for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight);
+ update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+ local_irq_restore(flags);
return 0;
}
@@ -1577,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_shares(struct sched_domain *sd)
{
- u64 now = cpu_clock(raw_smp_processor_id());
- s64 elapsed = now - sd->last_update;
+ s64 elapsed;
+ u64 now;
+
+ if (root_task_group_empty())
+ return;
+
+ now = cpu_clock(raw_smp_processor_id());
+ elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
@@ -1588,6 +1665,9 @@ static void update_shares(struct sched_domain *sd)
static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
+ if (root_task_group_empty())
+ return;
+
spin_unlock(&rq->lock);
update_shares(sd);
spin_lock(&rq->lock);
@@ -1595,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
static void update_h_load(long cpu)
{
+ if (root_task_group_empty())
+ return;
+
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
@@ -1610,21 +1693,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#endif
+#ifdef CONFIG_PREEMPT
+
/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
*/
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
@@ -1637,6 +1741,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
@@ -1654,6 +1774,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
+static void calc_load_account_active(struct rq *this_rq);
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -1705,6 +1827,9 @@ static void update_avg(u64 *avg, u64 sample)
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
+ if (wakeup)
+ p->se.start_runtime = p->se.sum_exec_runtime;
+
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
@@ -1712,10 +1837,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep && p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
+ if (sleep) {
+ if (p->se.last_wakeup) {
+ update_avg(&p->se.avg_overlap,
+ p->se.sum_exec_runtime - p->se.last_wakeup);
+ p->se.last_wakeup = 0;
+ } else {
+ update_avg(&p->se.avg_wakeup,
+ sysctl_sched_wakeup_granularity);
+ }
}
sched_info_dequeued(p);
@@ -1876,7 +2006,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
clock_offset = old_rq->clock - new_rq->clock;
- trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+ trace_sched_migrate_task(p, new_cpu);
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
@@ -1885,12 +2015,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+#endif
if (old_cpu != new_cpu) {
- schedstat_inc(p, se.nr_migrations);
+ p->se.nr_migrations++;
+ new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
if (task_hot(p, old_rq->clock, NULL))
schedstat_inc(p, se.nr_forced2_migrations);
- }
#endif
+ perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+ 1, 1, NULL, 0);
+ }
p->se.vruntime -= old_cfsrq->min_vruntime -
new_cfsrq->min_vruntime;
@@ -1933,6 +2068,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
}
/*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+ unsigned long nvcsw, nivcsw, flags;
+ int running;
+ struct rq *rq;
+
+ nvcsw = p->nvcsw;
+ nivcsw = p->nivcsw;
+ for (;;) {
+ /*
+ * The runqueue is assigned before the actual context
+ * switch. We need to take the runqueue lock.
+ *
+ * We could check initially without the lock but it is
+ * very likely that we need to take the lock in every
+ * iteration.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ task_rq_unlock(rq, &flags);
+
+ if (likely(!running))
+ break;
+ /*
+ * The switch count is incremented before the actual
+ * context switch. We thus wait for two switches to be
+ * sure at least one completed.
+ */
+ if ((p->nvcsw - nvcsw) > 1)
+ break;
+ if ((p->nivcsw - nivcsw) > 1)
+ break;
+
+ cpu_relax();
+ }
+}
+
+/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2017,7 +2195,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* it must be off the runqueue _entirely_, and not
* preempted!
*
- * So if it wa still runnable (but just not actively
+ * So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
@@ -2060,6 +2238,7 @@ void kick_process(struct task_struct *p)
smp_send_reschedule(cpu);
preempt_enable();
}
+EXPORT_SYMBOL_GPL(kick_process);
/*
* Return a low guess at the load of a migration-source cpu weighted
@@ -2133,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
@@ -2242,6 +2420,27 @@ static int sched_balance_self(int cpu, int flag)
#endif /* CONFIG_SMP */
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+ void (*func) (void *info), void *info)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if (task_curr(p))
+ smp_call_function_single(cpu, func, info, 1);
+ preempt_enable();
+}
+
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
@@ -2267,7 +2466,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
sync = 0;
#ifdef CONFIG_SMP
- if (sched_feat(LB_WAKEUP_UPDATE)) {
+ if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;
this_cpu = raw_smp_processor_id();
@@ -2345,6 +2544,22 @@ out_activate:
activate_task(rq, p, 1);
success = 1;
+ /*
+ * Only attribute actual wakeups done by this task.
+ */
+ if (!in_interrupt()) {
+ struct sched_entity *se = &current->se;
+ u64 sample = se->sum_exec_runtime;
+
+ if (se->last_wakeup)
+ sample -= se->last_wakeup;
+ else
+ sample -= se->start_runtime;
+ update_avg(&se->avg_wakeup, sample);
+
+ se->last_wakeup = se->sum_exec_runtime;
+ }
+
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
@@ -2355,13 +2570,22 @@ out_running:
p->sched_class->task_wake_up(rq, p);
#endif
out:
- current->se.last_wakeup = current->se.sum_exec_runtime;
-
task_rq_unlock(rq, &flags);
return success;
}
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_ALL, 0);
@@ -2384,19 +2608,44 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
p->se.last_wakeup = 0;
p->se.avg_overlap = 0;
+ p->se.start_runtime = 0;
+ p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
- p->se.sleep_max = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.wait_max = 0;
+ p->se.wait_start = 0;
+ p->se.wait_max = 0;
+ p->se.wait_count = 0;
+ p->se.wait_sum = 0;
+
+ p->se.sleep_start = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+
+ p->se.block_start = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+
#endif
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2431,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
set_task_cpu(p, cpu);
/*
- * Make sure we do not leak PI boosting priority to the child:
+ * Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+ p->policy = SCHED_NORMAL;
+
+ if (p->normal_prio < DEFAULT_PRIO)
+ p->prio = DEFAULT_PRIO;
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ set_load_weight(p);
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
@@ -2448,6 +2720,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
put_cpu();
}
@@ -2491,7 +2765,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
#ifdef CONFIG_PREEMPT_NOTIFIERS
/**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2604,11 +2878,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
- if (current->sched_class->post_schedule)
- current->sched_class->post_schedule(rq);
-#endif
fire_sched_in_preempt_notifiers(current);
if (mm)
@@ -2623,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
}
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+ if (rq->post_schedule) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->curr->sched_class->post_schedule)
+ rq->curr->sched_class->post_schedule(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ rq->post_schedule = 0;
+ }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -2633,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
struct rq *rq = this_rq();
finish_task_switch(rq, prev);
+
+ /*
+ * FIXME: do we need to worry about rq being invalidated by the
+ * task_switch?
+ */
+ post_schedule(rq);
+
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
@@ -2660,7 +2974,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* combine the page table reload and the switch backend into
* one hypercall.
*/
- arch_enter_lazy_cpu_mode();
+ arch_start_context_switch(prev);
if (unlikely(!mm)) {
next->active_mm = oldmm;
@@ -2750,19 +3064,81 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- unsigned long i, running = 0, uninterruptible = 0;
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
- for_each_online_cpu(i) {
- running += cpu_rq(i)->nr_running;
- uninterruptible += cpu_rq(i)->nr_uninterruptible;
- }
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+ unsigned long upd = calc_load_update + 10;
+ long active;
+
+ if (time_before(jiffies, upd))
+ return;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long nr_active, delta;
- if (unlikely((long)uninterruptible < 0))
- uninterruptible = 0;
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
- return running + uninterruptible;
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ atomic_long_add(delta, &calc_load_tasks);
+ }
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+ return cpu_rq(cpu)->nr_migrations_in;
}
/*
@@ -2793,6 +3169,11 @@ static void update_cpu_load(struct rq *this_rq)
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
+
+ if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ this_rq->calc_load_update += LOAD_FREQ;
+ calc_load_account_active(this_rq);
+ }
}
#ifdef CONFIG_SMP
@@ -2913,6 +3294,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
+ int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
@@ -2936,10 +3318,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- if (!task_hot(p, rq->clock, sd) ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ tsk_cache_hot = task_hot(p, rq->clock, sd);
+ if (!tsk_cache_hot ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
@@ -2947,7 +3330,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
return 1;
}
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
@@ -2987,6 +3370,16 @@ next:
pulled++;
rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible kernels
+ * will stop after the first task is pulled to minimize the critical
+ * section.
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ goto out;
+#endif
+
/*
* We only want to steal up to the prescribed amount of weighted load.
*/
@@ -3033,9 +3426,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
sd, idle, all_pinned, &this_best_prio);
class = class->next;
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is pulled to minimize
+ * the critical section.
+ */
if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
break;
-
+#endif
} while (class && max_load_move > total_load_moved);
return total_load_moved > 0;
@@ -3079,252 +3478,572 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
{
const struct sched_class *class;
- for (class = sched_class_highest; class; class = class->next)
+ for_each_class(class) {
if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
return 1;
+ }
return 0;
}
-
+/********** Helpers for find_busiest_group ************************/
/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
-{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- unsigned long max_pull;
- unsigned long busiest_load_per_task, busiest_nr_running;
- unsigned long this_load_per_task, this_nr_running;
- int load_idx, group_imb = 0;
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /** Statistics of this group */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+
+ /* Statistics of the busiest group */
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+
+ int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance = 1;
- unsigned long leader_nr_running = 0, min_load_per_task = 0;
- unsigned long min_nr_running = ULONG_MAX;
- struct sched_group *group_min = NULL, *group_leader = NULL;
+ int power_savings_balance; /* Is powersave balance needed for this sd */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
#endif
+};
- max_load = this_load = total_load = total_pwr = 0;
- busiest_load_per_task = busiest_nr_running = 0;
- this_load_per_task = this_nr_running = 0;
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_nr_running; /* Nr tasks running in the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long group_capacity;
+ int group_imb; /* Is there an imbalance in the group ? */
+};
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ int load_idx;
- if (idle == CPU_NOT_IDLE)
+ switch (idle) {
+ case CPU_NOT_IDLE:
load_idx = sd->busy_idx;
- else if (idle == CPU_NEWLY_IDLE)
+ break;
+
+ case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx;
- else
+ break;
+ default:
load_idx = sd->idle_idx;
+ break;
+ }
- do {
- unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
- int local_group;
- int i;
- int __group_imb = 0;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long sum_nr_running, sum_weighted_load;
- unsigned long sum_avg_load_per_task;
- unsigned long avg_load_per_task;
+ return load_idx;
+}
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
- if (local_group)
- balance_cpu = cpumask_first(sched_group_cpus(group));
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ sds->power_savings_balance = 0;
+ else {
+ sds->power_savings_balance = 1;
+ sds->min_nr_running = ULONG_MAX;
+ sds->leader_nr_running = 0;
+ }
+}
- /* Tally up the load of all CPUs in the group */
- sum_weighted_load = sum_nr_running = avg_load = 0;
- sum_avg_load_per_task = avg_load_per_task = 0;
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
+ if (!sds->power_savings_balance)
+ return;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- struct rq *rq = cpu_rq(i);
+ /*
+ * If the local group is idle or completely loaded
+ * no need to do power savings balance at this domain
+ */
+ if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+ !sds->this_nr_running))
+ sds->power_savings_balance = 0;
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!sds->power_savings_balance ||
+ sgs->sum_nr_running >= sgs->group_capacity ||
+ !sgs->sum_nr_running)
+ return;
- /* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sgs->sum_nr_running < sds->min_nr_running) ||
+ (sgs->sum_nr_running == sds->min_nr_running &&
+ group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+ sds->group_min = group;
+ sds->min_nr_running = sgs->sum_nr_running;
+ sds->min_load_per_task = sgs->sum_weighted_load /
+ sgs->sum_nr_running;
+ }
- load = target_load(i, load_idx);
- } else {
- load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
- }
+ /*
+ * Calculate the group which is almost near its
+ * capacity but still has some space to pick up some load
+ * from other group and save more power
+ */
+ if (sgs->sum_nr_running + 1 > sgs->group_capacity)
+ return;
- avg_load += load;
- sum_nr_running += rq->nr_running;
- sum_weighted_load += weighted_cpuload(i);
+ if (sgs->sum_nr_running > sds->leader_nr_running ||
+ (sgs->sum_nr_running == sds->leader_nr_running &&
+ group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+ sds->group_leader = group;
+ sds->leader_nr_running = sgs->sum_nr_running;
+ }
+}
- sum_avg_load_per_task += cpu_avg_load_per_task(i);
- }
+/**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ * under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ if (!sds->power_savings_balance)
+ return 0;
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu && balance) {
- *balance = 0;
- goto ret;
- }
+ if (sds->this != sds->group_leader ||
+ sds->group_leader == sds->group_min)
+ return 0;
- total_load += avg_load;
- total_pwr += group->__cpu_power;
+ *imbalance = sds->min_load_per_task;
+ sds->busiest = sds->group_min;
- /* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ group_first_cpu(sds->group_leader);
+ }
+ return 1;
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
- avg_load_per_task = sg_div_cpu_power(group,
- sum_avg_load_per_task * SCHED_LOAD_SCALE);
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+ return;
+}
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- __group_imb = 1;
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+ return;
+}
- group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+
+ return smt_gain;
+}
+
+unsigned long scale_rt_power(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 total, available;
+
+ sched_avg_update(rq);
+
+ total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ available = total - rq->rt_avg;
+
+ if (unlikely((s64)total < SCHED_LOAD_SCALE))
+ total = SCHED_LOAD_SCALE;
+
+ total >>= SCHED_LOAD_SHIFT;
+
+ return div_u64(available, total);
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long power = SCHED_LOAD_SCALE;
+ struct sched_group *sdg = sd->groups;
+
+ /* here we could scale based on cpufreq */
+
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+ power *= arch_scale_smt_power(sd, cpu);
+ power >>= SCHED_LOAD_SHIFT;
+ }
+
+ power *= scale_rt_power(cpu);
+ power >>= SCHED_LOAD_SHIFT;
+
+ if (!power)
+ power = 1;
+
+ sdg->cpu_power = power;
+}
+
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group, *sdg = sd->groups;
+ unsigned long power;
+
+ if (!child) {
+ update_cpu_power(sd, cpu);
+ return;
+ }
+
+ power = 0;
+
+ group = child->groups;
+ do {
+ power += group->cpu_power;
+ group = group->next;
+ } while (group != child->groups);
+
+ sdg->cpu_power = power;
+}
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+ struct sched_group *group, int this_cpu,
+ enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ int local_group, const struct cpumask *cpus,
+ int *balance, struct sg_lb_stats *sgs)
+{
+ unsigned long load, max_cpu_load, min_cpu_load;
+ int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned long sum_avg_load_per_task;
+ unsigned long avg_load_per_task;
+
+ if (local_group) {
+ balance_cpu = group_first_cpu(group);
+ if (balance_cpu == this_cpu)
+ update_group_power(sd, this_cpu);
+ }
+
+ /* Tally up the load of all CPUs in the group */
+ sum_avg_load_per_task = avg_load_per_task = 0;
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
+
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ if (*sd_idle && rq->nr_running)
+ *sd_idle = 0;
+
+ /* Bias balancing toward cpus of our domain */
if (local_group) {
- this_load = avg_load;
- this = group;
- this_nr_running = sum_nr_running;
- this_load_per_task = sum_weighted_load;
- } else if (avg_load > max_load &&
- (sum_nr_running > group_capacity || __group_imb)) {
- max_load = avg_load;
- busiest = group;
- busiest_nr_running = sum_nr_running;
- busiest_load_per_task = sum_weighted_load;
- group_imb = __group_imb;
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
+ }
+
+ load = target_load(i, load_idx);
+ } else {
+ load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE ||
- !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto group_next;
+ sgs->group_load += load;
+ sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_weighted_load += weighted_cpuload(i);
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (this_nr_running >= group_capacity ||
- !this_nr_running))
- power_savings_balance = 0;
+ sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ }
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!power_savings_balance || sum_nr_running >= group_capacity
- || !sum_nr_running)
- goto group_next;
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ return;
+ }
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sum_nr_running < min_nr_running) ||
- (sum_nr_running == min_nr_running &&
- cpumask_first(sched_group_cpus(group)) >
- cpumask_first(sched_group_cpus(group_min)))) {
- group_min = group;
- min_nr_running = sum_nr_running;
- min_load_per_task = sum_weighted_load /
- sum_nr_running;
- }
+ /* Adjust by relative CPU power of the group */
+ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of two tasks.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+ group->cpu_power;
+
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ sgs->group_imb = 1;
+
+ sgs->group_capacity =
+ DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+}
+
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+ enum cpu_idle_type idle, int *sd_idle,
+ const struct cpumask *cpus, int *balance,
+ struct sd_lb_stats *sds)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group = sd->groups;
+ struct sg_lb_stats sgs;
+ int load_idx, prefer_sibling = 0;
+
+ if (child && child->flags & SD_PREFER_SIBLING)
+ prefer_sibling = 1;
+
+ init_sd_power_savings_stats(sd, sds, idle);
+ load_idx = get_sd_load_idx(sd, idle);
+
+ do {
+ int local_group;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+ memset(&sgs, 0, sizeof(sgs));
+ update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ local_group, cpus, balance, &sgs);
+
+ if (local_group && balance && !(*balance))
+ return;
+
+ sds->total_load += sgs.group_load;
+ sds->total_pwr += group->cpu_power;
/*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
+ * In case the child domain prefers tasks go to siblings
+ * first, lower the group capacity to one so that we'll try
+ * and move all the excess tasks away.
*/
- if (sum_nr_running <= group_capacity - 1) {
- if (sum_nr_running > leader_nr_running ||
- (sum_nr_running == leader_nr_running &&
- cpumask_first(sched_group_cpus(group)) <
- cpumask_first(sched_group_cpus(group_leader)))) {
- group_leader = group;
- leader_nr_running = sum_nr_running;
- }
+ if (prefer_sibling)
+ sgs.group_capacity = min(sgs.group_capacity, 1UL);
+
+ if (local_group) {
+ sds->this_load = sgs.avg_load;
+ sds->this = group;
+ sds->this_nr_running = sgs.sum_nr_running;
+ sds->this_load_per_task = sgs.sum_weighted_load;
+ } else if (sgs.avg_load > sds->max_load &&
+ (sgs.sum_nr_running > sgs.group_capacity ||
+ sgs.group_imb)) {
+ sds->max_load = sgs.avg_load;
+ sds->busiest = group;
+ sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->group_imb = sgs.group_imb;
}
-group_next:
-#endif
+
+ update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
+}
- if (!busiest || this_load >= max_load || busiest_nr_running == 0)
- goto out_balanced;
-
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
-
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
- goto out_balanced;
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * amongst the groups of a sched_domain, during
+ * load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned int imbn = 2;
+
+ if (sds->this_nr_running) {
+ sds->this_load_per_task /= sds->this_nr_running;
+ if (sds->busiest_load_per_task >
+ sds->this_load_per_task)
+ imbn = 1;
+ } else
+ sds->this_load_per_task =
+ cpu_avg_load_per_task(this_cpu);
- busiest_load_per_task /= busiest_nr_running;
- if (group_imb)
- busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+ sds->busiest_load_per_task * imbn) {
+ *imbalance = sds->busiest_load_per_task;
+ return;
+ }
/*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load, as either of these
- * actions would just result in more rebalancing later, and ping-pong
- * tasks around. Thus we look for the minimum possible imbalance.
- * Negative imbalances (*we* are more loaded than anyone else) will
- * be counted as no imbalance for these purposes -- we can't fix that
- * by pulling tasks to us. Be careful of negative numbers as they'll
- * appear as very large values with unsigned longs.
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
*/
- if (max_load <= busiest_load_per_task)
- goto out_balanced;
+ pwr_now += sds->busiest->cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load);
+ pwr_now += sds->this->cpu_power *
+ min(sds->this_load_per_task, sds->this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ sds->busiest->cpu_power;
+ if (sds->max_load > tmp)
+ pwr_move += sds->busiest->cpu_power *
+ min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+ /* Amount of load we'd add */
+ if (sds->max_load * sds->busiest->cpu_power <
+ sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = (sds->max_load * sds->busiest->cpu_power) /
+ sds->this->cpu_power;
+ else
+ tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ sds->this->cpu_power;
+ pwr_move += sds->this->cpu_power *
+ min(sds->this_load_per_task, sds->this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain throughput */
+ if (pwr_move > pwr_now)
+ *imbalance = sds->busiest_load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+ unsigned long *imbalance)
+{
+ unsigned long max_pull;
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (max_load < avg_load) {
+ if (sds->max_load < sds->avg_load) {
*imbalance = 0;
- goto small_imbalance;
+ return fix_small_imbalance(sds, this_cpu, imbalance);
}
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ max_pull = min(sds->max_load - sds->avg_load,
+ sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->__cpu_power,
- (avg_load - this_load) * this->__cpu_power)
+ *imbalance = min(max_pull * sds->busiest->cpu_power,
+ (sds->avg_load - sds->this_load) * sds->this->cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -3333,83 +4052,135 @@ group_next:
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < busiest_load_per_task) {
- unsigned long tmp, pwr_now, pwr_move;
- unsigned int imbn;
-
-small_imbalance:
- pwr_move = pwr_now = 0;
- imbn = 2;
- if (this_nr_running) {
- this_load_per_task /= this_nr_running;
- if (busiest_load_per_task > this_load_per_task)
- imbn = 1;
- } else
- this_load_per_task = cpu_avg_load_per_task(this_cpu);
+ if (*imbalance < sds->busiest_load_per_task)
+ return fix_small_imbalance(sds, this_cpu, imbalance);
- if (max_load - this_load + busiest_load_per_task >=
- busiest_load_per_task * imbn) {
- *imbalance = busiest_load_per_task;
- return busiest;
- }
+}
+/******* find_busiest_group() helpers end here *********************/
- /*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
- * moving them.
- */
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ * be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ * is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns: - the busiest group if imbalance exists.
+ * - If no imbalance and user has opted for power-savings balance,
+ * return the least loaded group whose CPUs can be
+ * put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+ struct sd_lb_stats sds;
- pwr_now += busiest->__cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->__cpu_power *
- min(this_load_per_task, this_load);
- pwr_now /= SCHED_LOAD_SCALE;
-
- /* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(busiest,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- if (max_load > tmp)
- pwr_move += busiest->__cpu_power *
- min(busiest_load_per_task, max_load - tmp);
-
- /* Amount of load we'd add */
- if (max_load * busiest->__cpu_power <
- busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(this,
- max_load * busiest->__cpu_power);
- else
- tmp = sg_div_cpu_power(this,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += this->__cpu_power *
- min(this_load_per_task, this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ memset(&sds, 0, sizeof(sds));
- /* Move if we gain throughput */
- if (pwr_move > pwr_now)
- *imbalance = busiest_load_per_task;
- }
+ /*
+ * Compute the various statistics relavent for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+ balance, &sds);
+
+ /* Cases where imbalance does not exist from POV of this_cpu */
+ /* 1) this_cpu is not the appropriate cpu to perform load balancing
+ * at this level.
+ * 2) There is no busy sibling group to pull from.
+ * 3) This group is the busiest group.
+ * 4) This group is more busy than the avg busieness at this
+ * sched_domain.
+ * 5) The imbalance is within the specified limit.
+ * 6) Any rebalance would lead to ping-pong
+ */
+ if (balance && !(*balance))
+ goto ret;
- return busiest;
+ if (!sds.busiest || sds.busiest_nr_running == 0)
+ goto out_balanced;
-out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- goto ret;
+ if (sds.this_load >= sds.max_load)
+ goto out_balanced;
- if (this == group_leader && group_leader != group_min) {
- *imbalance = min_load_per_task;
- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- cpumask_first(sched_group_cpus(group_leader));
- }
- return group_min;
- }
-#endif
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+ if (sds.this_load >= sds.avg_load)
+ goto out_balanced;
+
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+
+ sds.busiest_load_per_task /= sds.busiest_nr_running;
+ if (sds.group_imb)
+ sds.busiest_load_per_task =
+ min(sds.busiest_load_per_task, sds.avg_load);
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
+ */
+ if (sds.max_load <= sds.busiest_load_per_task)
+ goto out_balanced;
+
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(&sds, this_cpu, imbalance);
+ return sds.busiest;
+
+out_balanced:
+ /*
+ * There is no obvious imbalance. But check if we can do some balancing
+ * to save power.
+ */
+ if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+ return sds.busiest;
ret:
*imbalance = 0;
return NULL;
}
+static struct sched_group *group_of(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+ if (!sd)
+ return NULL;
+
+ return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+ struct sched_group *group = group_of(cpu);
+
+ if (!group)
+ return SCHED_LOAD_SCALE;
+
+ return group->cpu_power;
+}
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
@@ -3422,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
int i;
for_each_cpu(i, sched_group_cpus(group)) {
+ unsigned long power = power_of(i);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+ wl /= power;
- if (rq->nr_running == 1 && wl > imbalance)
+ if (capacity && rq->nr_running == 1 && wl > imbalance)
continue;
if (wl > max_load) {
@@ -3448,19 +4222,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
*/
#define MAX_PINNED_INTERVAL 512
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance, struct cpumask *cpus)
+ int *balance)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
@@ -3615,8 +4393,7 @@ out:
* this_rq is locked.
*/
static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- struct cpumask *cpus)
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
struct rq *busiest = NULL;
@@ -3624,6 +4401,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
int ld_moved = 0;
int sd_idle = 0;
int all_pinned = 0;
+ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
cpumask_setall(cpus);
@@ -3764,10 +4542,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_var_t tmpmask;
-
- if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
- return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -3778,7 +4552,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd, tmpmask);
+ sd);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +4567,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
this_rq->next_balance = next_balance;
}
- free_cpumask_var(tmpmask);
}
/*
@@ -3851,10 +4624,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
static struct {
atomic_t load_balancer;
cpumask_var_t cpu_mask;
+ cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd)
+ if (sd && (sd->flags & flag))
+ break;
+
+ return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: variable holding the value of the power_savings_sd
+ * for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+ for (sd = lowest_flag_domain(cpu, flag); \
+ (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+ cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ sched_group_cpus(ilb_group));
+
+ /*
+ * A sched_group is semi-idle when it has atleast one busy cpu
+ * and atleast one idle cpu.
+ */
+ if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ return 0;
+
+ if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ return 0;
+
+ return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ * Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *ilb_group;
+
+ /*
+ * Have idle load balancer selection from semi-idle packages only
+ * when power-aware load balancing is enabled
+ */
+ if (!(sched_smt_power_savings || sched_mc_power_savings))
+ goto out_done;
+
+ /*
+ * Optimize for the case when we have no idle CPUs or only one
+ * idle CPU. Don't walk the sched_domain hierarchy in such cases
+ */
+ if (cpumask_weight(nohz.cpu_mask) < 2)
+ goto out_done;
+
+ for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+ ilb_group = sd->groups;
+
+ do {
+ if (is_semi_idle_group(ilb_group))
+ return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+ ilb_group = ilb_group->next;
+
+ } while (ilb_group != sd->groups);
+ }
+
+out_done:
+ return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+ return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -3880,19 +4774,24 @@ int select_nohz_load_balancer(int stop_tick)
int cpu = smp_processor_id();
if (stop_tick) {
- cpumask_set_cpu(cpu, nohz.cpu_mask);
cpu_rq(cpu)->in_nohz_recently = 1;
- /*
- * If we are going offline and still the leader, give up!
- */
- if (!cpu_active(cpu) &&
- atomic_read(&nohz.load_balancer) == cpu) {
+ if (!cpu_active(cpu)) {
+ if (atomic_read(&nohz.load_balancer) != cpu)
+ return 0;
+
+ /*
+ * If we are going offline and still the leader,
+ * give up!
+ */
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
BUG();
+
return 0;
}
+ cpumask_set_cpu(cpu, nohz.cpu_mask);
+
/* time for ilb owner also to sleep */
if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
if (atomic_read(&nohz.load_balancer) == cpu)
@@ -3904,8 +4803,24 @@ int select_nohz_load_balancer(int stop_tick)
/* make me the ilb owner */
if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu)
+ } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ int new_ilb;
+
+ if (!(sched_smt_power_savings ||
+ sched_mc_power_savings))
+ return 1;
+ /*
+ * Check to see if there is a more power-efficient
+ * ilb.
+ */
+ new_ilb = find_new_ilb(cpu);
+ if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+ atomic_set(&nohz.load_balancer, -1);
+ resched_cpu(new_ilb);
+ return 0;
+ }
return 1;
+ }
} else {
if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
return 0;
@@ -3938,11 +4853,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_var_t tmp;
-
- /* Fails alloc? Rebalancing probably not a priority right now. */
- if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
- return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3967,7 +4877,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -4001,8 +4911,6 @@ out:
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
-
- free_cpumask_var(tmp);
}
/*
@@ -4052,6 +4960,11 @@ static void run_rebalance_domains(struct softirq_action *h)
#endif
}
+static inline int on_null_domain(int cpu)
+{
+ return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*
@@ -4076,15 +4989,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
}
if (atomic_read(&nohz.load_balancer) == -1) {
- /*
- * simple selection for now: Nominate the
- * first cpu in the nohz list to be the next
- * ilb owner.
- *
- * TBD: Traverse the sched domains and nominate
- * the nearest cpu in the nohz.cpu_mask.
- */
- int ilb = cpumask_first(nohz.cpu_mask);
+ int ilb = find_new_ilb(cpu);
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
@@ -4109,7 +5014,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
- if (time_after_eq(jiffies, rq->next_balance))
+ /* Don't need to rebalance while attached to NULL domain */
+ if (time_after_eq(jiffies, rq->next_balance) &&
+ likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
}
@@ -4129,9 +5036,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
*/
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ if (task_current(rq, p)) {
+ update_rq_clock(rq);
+ ns = rq->clock - p->se.exec_start;
+ if ((s64)ns < 0)
+ ns = 0;
+ }
+
+ return ns;
+}
+
unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
@@ -4139,16 +5062,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
u64 ns = 0;
rq = task_rq_lock(p, &flags);
+ ns = do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
- if (task_current(rq, p)) {
- u64 delta_exec;
+ return ns;
+}
- update_rq_clock(rq);
- delta_exec = rq->clock - p->se.exec_start;
- if ((s64)delta_exec > 0)
- ns = delta_exec;
- }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns = 0;
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value not includes other pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+ struct task_cputime totals;
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+ rq = task_rq_lock(p, &flags);
+ thread_group_cputime(p, &totals);
+ ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);
return ns;
@@ -4177,6 +5133,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);
+
+ cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -4238,6 +5196,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
cpustat->system = cputime64_add(cpustat->system, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
/* Account for system time used */
acct_update_integrals(p);
}
@@ -4285,7 +5245,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (user_tick)
account_user_time(p, one_jiffy, one_jiffy_scaled);
- else if (p != rq->idle)
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
one_jiffy_scaled);
else
@@ -4393,16 +5353,15 @@ void scheduler_tick(void)
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
+ perf_counter_task_tick(curr, cpu);
+
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
}
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
-
-static inline unsigned long get_parent_ip(unsigned long addr)
+notrace unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
addr = CALLER_ADDR2;
@@ -4412,6 +5371,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
return addr;
}
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -4503,11 +5465,33 @@ static inline void schedule_debug(struct task_struct *prev)
#endif
}
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->state == TASK_RUNNING) {
+ u64 runtime = prev->se.sum_exec_runtime;
+
+ runtime -= prev->se.prev_sum_exec_runtime;
+ runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+ /*
+ * In order to avoid avg_overlap growing stale when we are
+ * indeed overlapping and hence not getting put to sleep, grow
+ * the avg_overlap on preemption.
+ *
+ * We use the average preemption runtime because that
+ * correlates to the amount of cache footprint a task can
+ * build up.
+ */
+ update_avg(&prev->se.avg_overlap, runtime);
+ }
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
@@ -4549,7 +5533,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_qsctr_inc(cpu);
+ rcu_sched_qs(cpu);
prev = rq->curr;
switch_count = &prev->nivcsw;
@@ -4573,19 +5557,17 @@ need_resched_nonpreemptible:
switch_count = &prev->nvcsw;
}
-#ifdef CONFIG_SMP
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-#endif
+ pre_schedule(rq, prev);
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- prev->sched_class->put_prev_task(rq, prev);
- next = pick_next_task(rq, prev);
+ put_prev_task(rq, prev);
+ next = pick_next_task(rq);
if (likely(prev != next)) {
sched_info_switch(prev, next);
+ perf_counter_task_sched_out(prev, next, cpu);
rq->nr_switches++;
rq->curr = next;
@@ -4601,15 +5583,78 @@ need_resched_nonpreemptible:
} else
spin_unlock_irq(&rq->lock);
+ post_schedule(rq);
+
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
preempt_enable_no_resched();
- if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ if (need_resched())
goto need_resched;
}
EXPORT_SYMBOL(schedule);
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+ unsigned int cpu;
+ struct rq *rq;
+
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * Need to access the cpu field knowing that
+ * DEBUG_PAGEALLOC could have unmapped it if
+ * the mutex owner just released it and exited.
+ */
+ if (probe_kernel_address(&owner->cpu, cpu))
+ goto out;
+#else
+ cpu = owner->cpu;
+#endif
+
+ /*
+ * Even if the access succeeded (likely case),
+ * the cpu field may no longer be valid.
+ */
+ if (cpu >= nr_cpumask_bits)
+ goto out;
+
+ /*
+ * We need to validate that we can do a
+ * get_cpu() and that we have the percpu area.
+ */
+ if (!cpu_online(cpu))
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ /*
+ * Owner changed, break to re-assess state.
+ */
+ if (lock->owner != owner)
+ break;
+
+ /*
+ * Is that owner really running on that cpu?
+ */
+ if (task_thread_info(rq->curr) != owner || need_resched())
+ return 0;
+
+ cpu_relax();
+ }
+out:
+ return 1;
+}
+#endif
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -4637,7 +5682,7 @@ asmlinkage void __sched preempt_schedule(void)
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
@@ -4666,7 +5711,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
#endif /* CONFIG_PREEMPT */
@@ -4688,7 +5733,7 @@ EXPORT_SYMBOL(default_wake_function);
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int sync, void *key)
+ int nr_exclusive, int sync, void *key)
{
wait_queue_t *curr, *next;
@@ -4707,6 +5752,9 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
@@ -4727,11 +5775,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
__wake_up_common(q, mode, 1, 0, NULL);
}
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+
/**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
* away soon, so while the target thread will be woken up, it will not
@@ -4739,9 +5793,12 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
* with each other. This can prevent needless bouncing between CPUs.
*
* On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
-void
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
{
unsigned long flags;
int sync = 1;
@@ -4753,9 +5810,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
sync = 0;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+ __wake_up_common(q, mode, nr_exclusive, sync, key);
spin_unlock_irqrestore(&q->lock, flags);
}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/**
@@ -4766,6 +5832,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
* awakened in the same order in which they were queued.
*
* See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete(struct completion *x)
{
@@ -4783,6 +5852,9 @@ EXPORT_SYMBOL(complete);
* @x: holds the state of this particular completion
*
* This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
*/
void complete_all(struct completion *x)
{
@@ -5140,7 +6212,7 @@ SYSCALL_DEFINE1(nice, int, increment)
if (increment > 40)
increment = 40;
- nice = PRIO_TO_NICE(current->static_prio) + increment;
+ nice = TASK_NICE(current) + increment;
if (nice < -20)
nice = -20;
if (nice > 19)
@@ -5258,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
unsigned long flags;
const struct sched_class *prev_class = p->sched_class;
struct rq *rq;
+ int reset_on_fork;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
- if (policy < 0)
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
- else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
- return -EINVAL;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
+ return -EINVAL;
+ }
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -5312,6 +6392,10 @@ recheck:
/* can't change other user's priorities */
if (!check_same_owner(p))
return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
}
if (user) {
@@ -5355,6 +6439,8 @@ recheck:
if (running)
p->sched_class->put_prev_task(rq, p);
+ p->sched_reset_on_fork = reset_on_fork;
+
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
@@ -5471,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (p) {
retval = security_task_getscheduler(p);
if (!retval)
- retval = p->policy;
+ retval = p->policy
+ | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
}
read_unlock(&tasklist_lock);
return retval;
}
/**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
*/
@@ -5699,27 +6786,21 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
+static inline int should_resched(void)
+{
+ return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
static void __cond_resched(void)
{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- __might_sleep(__FILE__, __LINE__);
-#endif
- /*
- * The BKS might be reacquired before we have dropped
- * PREEMPT_ACTIVE, which could trigger a second
- * cond_resched() call.
- */
- do {
- add_preempt_count(PREEMPT_ACTIVE);
- schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
- } while (need_resched());
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
{
- if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
- system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
__cond_resched();
return 1;
}
@@ -5728,21 +6809,23 @@ int __sched _cond_resched(void)
EXPORT_SYMBOL(_cond_resched);
/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
{
- int resched = need_resched() && system_state == SYSTEM_RUNNING;
+ int resched = should_resched();
int ret = 0;
+ lockdep_assert_held(lock);
+
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched && need_resched())
+ if (resched)
__cond_resched();
else
cpu_relax();
@@ -5751,13 +6834,13 @@ int cond_resched_lock(spinlock_t *lock)
}
return ret;
}
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched() && system_state == SYSTEM_RUNNING) {
+ if (should_resched()) {
local_bh_enable();
__cond_resched();
local_bh_disable();
@@ -5765,7 +6848,7 @@ int __sched cond_resched_softirq(void)
}
return 0;
}
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
/**
* yield - yield the current processor to other threads.
@@ -5789,11 +6872,13 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
schedule();
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
}
@@ -5801,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
long ret;
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
ret = schedule_timeout(timeout);
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
return ret;
@@ -5939,15 +7026,11 @@ void sched_show_task(struct task_struct *p)
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
- {
- unsigned long *n = end_of_stack(p);
- while (!*n)
- n++;
- free = (unsigned long)n - (unsigned long)end_of_stack(p);
- }
+ free = stack_not_used(p);
#endif
- printk(KERN_CONT "%5lu %5d %6d\n", free,
- task_pid_nr(p), task_pid_nr(p->real_parent));
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), task_pid_nr(p->real_parent),
+ (unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
}
@@ -6127,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */
+ struct task_struct *mt = rq->migration_thread;
+
+ get_task_struct(mt);
task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread);
+ put_task_struct(mt);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
return 0;
@@ -6186,6 +7273,11 @@ fail:
return ret;
}
+#define RCU_MIGRATION_IDLE 0
+#define RCU_MIGRATION_NEED_QS 1
+#define RCU_MIGRATION_GOT_QS 2
+#define RCU_MIGRATION_MUST_SYNC 3
+
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
@@ -6193,6 +7285,7 @@ fail:
*/
static int migration_thread(void *data)
{
+ int badcpu;
int cpu = (long)data;
struct rq *rq;
@@ -6208,7 +7301,7 @@ static int migration_thread(void *data)
if (cpu_is_offline(cpu)) {
spin_unlock_irq(&rq->lock);
- goto wait_to_die;
+ break;
}
if (rq->active_balance) {
@@ -6227,23 +7320,23 @@ static int migration_thread(void *data)
req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
+ if (req->task != NULL) {
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ } else if (likely(cpu == (badcpu = smp_processor_id()))) {
+ req->dest_cpu = RCU_MIGRATION_GOT_QS;
+ spin_unlock(&rq->lock);
+ } else {
+ req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+ spin_unlock(&rq->lock);
+ WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+ }
local_irq_enable();
complete(&req->done);
}
__set_current_state(TASK_RUNNING);
- return 0;
-wait_to_die:
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
return 0;
}
@@ -6418,7 +7511,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
if (!rq->nr_running)
break;
update_rq_clock(rq);
- next = pick_next_task(rq, rq->curr);
+ next = pick_next_task(rq);
if (!next)
break;
next->sched_class->put_prev_task(rq, next);
@@ -6426,6 +7519,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
}
}
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
+}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6649,7 +7751,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
rq = task_rq_lock(p, &flags);
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
task_rq_unlock(rq, &flags);
+ get_task_struct(p);
cpu_rq(cpu)->migration_thread = p;
+ rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
@@ -6677,6 +7781,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
kthread_bind(cpu_rq(cpu)->migration_thread,
cpumask_any(cpu_online_mask));
kthread_stop(cpu_rq(cpu)->migration_thread);
+ put_task_struct(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
@@ -6686,6 +7791,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
+ put_task_struct(rq->migration_thread);
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
spin_lock_irq(&rq->lock);
@@ -6699,7 +7805,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
-
+ calc_global_load_remove(rq);
/*
* No need to migrate the tasks: it was best-effort if
* they didn't take sched_hotcpu_mutex. Just wake up
@@ -6735,8 +7841,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
@@ -6754,7 +7862,7 @@ static int __init migration_init(void)
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
- return err;
+ return 0;
}
early_initcall(migration_init);
#endif
@@ -6801,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->__cpu_power) {
+ if (!group->cpu_power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -6823,7 +7931,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
printk(KERN_CONT " %s", str);
+ if (group->cpu_power != SCHED_LOAD_SCALE) {
+ printk(KERN_CONT " (cpu_power = %d)",
+ group->cpu_power);
+ }
group = group->next;
} while (group != sd->groups);
@@ -6939,52 +8052,58 @@ static void free_rootdomain(struct root_domain *rd)
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
+ struct root_domain *old_rd = NULL;
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
- struct root_domain *old_rd = rq->rd;
+ old_rd = rq->rd;
if (cpumask_test_cpu(rq->cpu, old_rd->online))
set_rq_offline(rq);
cpumask_clear_cpu(rq->cpu, old_rd->span);
- if (atomic_dec_and_test(&old_rd->refcount))
- free_rootdomain(old_rd);
+ /*
+ * If we dont want to free the old_rt yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
}
atomic_inc(&rd->refcount);
rq->rd = rd;
cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
+
+ if (old_rd)
+ free_rootdomain(old_rd);
}
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));
- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;
- if (cpupri_init(&rd->cpupri, false) != 0)
+ if (cpupri_init(&rd->cpupri, bootmem) != 0)
goto free_rto_mask;
return 0;
@@ -7101,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span,
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->__cpu_power = 0;
+ sg->cpu_power = 0;
for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -7195,8 +8314,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ * and struct sched_domain. )
*/
struct static_sched_group {
struct sched_group sg;
@@ -7208,6 +8328,39 @@ struct static_sched_domain {
DECLARE_BITMAP(span, CONFIG_NR_CPUS);
};
+struct s_data {
+#ifdef CONFIG_NUMA
+ int sd_allnodes;
+ cpumask_var_t domainspan;
+ cpumask_var_t covered;
+ cpumask_var_t notcovered;
+#endif
+ cpumask_var_t nodemask;
+ cpumask_var_t this_sibling_map;
+ cpumask_var_t this_core_map;
+ cpumask_var_t send_covered;
+ cpumask_var_t tmpmask;
+ struct sched_group **sched_group_nodes;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_sched_groups = 0,
+ sa_rootdomain,
+ sa_tmpmask,
+ sa_send_covered,
+ sa_this_core_map,
+ sa_this_sibling_map,
+ sa_nodemask,
+ sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+ sa_notcovered,
+ sa_covered,
+ sa_domainspan,
+#endif
+ sa_none,
+};
+
/*
* SMT sched-domains:
*/
@@ -7240,7 +8393,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
{
int group;
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
if (sg)
*sg = &per_cpu(sched_group_core, group).sg;
@@ -7269,7 +8422,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
group = cpumask_first(mask);
#else
group = cpu;
@@ -7317,7 +8470,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
struct sched_domain *sd;
sd = &per_cpu(phys_domains, j).sd;
- if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+ if (j != group_first_cpu(sd->groups)) {
/*
* Only add "power" once for each
* physical package.
@@ -7325,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
continue;
}
- sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+ sg->cpu_power += sd->groups->cpu_power;
}
sg = sg->next;
} while (sg != group_head);
}
+
+static int build_numa_sched_groups(struct s_data *d,
+ const struct cpumask *cpu_map, int num)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg, *prev;
+ int n, j;
+
+ cpumask_clear(d->covered);
+ cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+ if (cpumask_empty(d->nodemask)) {
+ d->sched_group_nodes[num] = NULL;
+ goto out;
+ }
+
+ sched_domain_node_span(num, d->domainspan);
+ cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+ sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, num);
+ if (!sg) {
+ printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+ num);
+ return -ENOMEM;
+ }
+ d->sched_group_nodes[num] = sg;
+
+ for_each_cpu(j, d->nodemask) {
+ sd = &per_cpu(node_domains, j).sd;
+ sd->groups = sg;
+ }
+
+ sg->cpu_power = 0;
+ cpumask_copy(sched_group_cpus(sg), d->nodemask);
+ sg->next = sg;
+ cpumask_or(d->covered, d->covered, d->nodemask);
+
+ prev = sg;
+ for (j = 0; j < nr_node_ids; j++) {
+ n = (num + j) % nr_node_ids;
+ cpumask_complement(d->notcovered, d->covered);
+ cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+ cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+ if (cpumask_empty(d->tmpmask))
+ break;
+ cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+ if (cpumask_empty(d->tmpmask))
+ continue;
+ sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, num);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ return -ENOMEM;
+ }
+ sg->cpu_power = 0;
+ cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+ sg->next = prev->next;
+ cpumask_or(d->covered, d->covered, d->tmpmask);
+ prev->next = sg;
+ prev = sg;
+ }
+out:
+ return 0;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA
@@ -7383,45 +8601,47 @@ static void free_sched_groups(const struct cpumask *cpu_map,
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_domain *child;
struct sched_group *group;
+ long power;
+ int weight;
WARN_ON(!sd || !sd->groups);
- if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+ if (cpu != group_first_cpu(sd->groups))
return;
child = sd->child;
- sd->groups->__cpu_power = 0;
+ sd->groups->cpu_power = 0;
- /*
- * For perf policy, if the groups in child domain share resources
- * (for example cores sharing some portions of the cache hierarchy
- * or SMT), then set this domain groups cpu_power such that each group
- * can handle only one task, when there are other idle groups in the
- * same sched domain.
- */
- if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
- (child->flags &
- (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+ if (!child) {
+ power = SCHED_LOAD_SCALE;
+ weight = cpumask_weight(sched_domain_span(sd));
+ /*
+ * SMT siblings share the power of a single core.
+ * Usually multiple threads get a better yield out of
+ * that one core than a single thread would have,
+ * reflect that in sd->smt_gain.
+ */
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+ power *= sd->smt_gain;
+ power /= weight;
+ power >>= SCHED_LOAD_SHIFT;
+ }
+ sd->groups->cpu_power += power;
return;
}
/*
- * add cpu_power of each child group to this groups cpu_power
+ * Add cpu_power of each child group to this groups cpu_power.
*/
group = child->groups;
do {
- sg_inc_cpu_power(sd->groups, group->__cpu_power);
+ sd->groups->cpu_power += group->cpu_power;
group = group->next;
} while (group != child->groups);
}
@@ -7495,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd,
}
}
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
-{
- int i, err = -ENOMEM;
- struct root_domain *rd;
- cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
- tmpmask;
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_sched_groups:
+ free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+ d->sched_group_nodes = NULL;
+ case sa_rootdomain:
+ free_rootdomain(d->rd); /* fall through */
+ case sa_tmpmask:
+ free_cpumask_var(d->tmpmask); /* fall through */
+ case sa_send_covered:
+ free_cpumask_var(d->send_covered); /* fall through */
+ case sa_this_core_map:
+ free_cpumask_var(d->this_core_map); /* fall through */
+ case sa_this_sibling_map:
+ free_cpumask_var(d->this_sibling_map); /* fall through */
+ case sa_nodemask:
+ free_cpumask_var(d->nodemask); /* fall through */
+ case sa_sched_group_nodes:
#ifdef CONFIG_NUMA
- cpumask_var_t domainspan, covered, notcovered;
- struct sched_group **sched_group_nodes = NULL;
- int sd_allnodes = 0;
-
- if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&covered, GFP_KERNEL))
- goto free_domainspan;
- if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
- goto free_covered;
-#endif
-
- if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
- goto free_notcovered;
- if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
- goto free_nodemask;
- if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
- goto free_this_sibling_map;
- if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
- goto free_this_core_map;
- if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
- goto free_send_covered;
+ kfree(d->sched_group_nodes); /* fall through */
+ case sa_notcovered:
+ free_cpumask_var(d->notcovered); /* fall through */
+ case sa_covered:
+ free_cpumask_var(d->covered); /* fall through */
+ case sa_domainspan:
+ free_cpumask_var(d->domainspan); /* fall through */
+#endif
+ case sa_none:
+ break;
+ }
+}
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+ const struct cpumask *cpu_map)
+{
#ifdef CONFIG_NUMA
- /*
- * Allocate the per-node list of sched groups
- */
- sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
- GFP_KERNEL);
- if (!sched_group_nodes) {
+ if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+ return sa_none;
+ if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+ return sa_domainspan;
+ if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+ return sa_covered;
+ /* Allocate the per-node list of sched groups */
+ d->sched_group_nodes = kcalloc(nr_node_ids,
+ sizeof(struct sched_group *), GFP_KERNEL);
+ if (!d->sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- goto free_tmpmask;
- }
-#endif
-
- rd = alloc_rootdomain();
- if (!rd) {
+ return sa_notcovered;
+ }
+ sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+ if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+ return sa_sched_group_nodes;
+ if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+ return sa_nodemask;
+ if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+ return sa_this_sibling_map;
+ if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ return sa_this_core_map;
+ if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+ return sa_send_covered;
+ d->rd = alloc_rootdomain();
+ if (!d->rd) {
printk(KERN_WARNING "Cannot alloc root domain\n");
- goto free_sched_groups;
+ return sa_tmpmask;
}
+ return sa_rootdomain;
+}
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+ struct sched_domain *sd = NULL;
#ifdef CONFIG_NUMA
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
-#endif
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = NULL, *p;
-
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
-
-#ifdef CONFIG_NUMA
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
- p = sd;
- sd_allnodes = 1;
- } else
- p = NULL;
+ struct sched_domain *parent;
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
+ d->sd_allnodes = 0;
+ if (cpumask_weight(cpu_map) >
+ SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+ sd = &per_cpu(allnodes_domains, i).sd;
+ SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = p;
- if (p)
- p->child = sd;
- cpumask_and(sched_domain_span(sd),
- sched_domain_span(sd), cpu_map);
+ cpumask_copy(sched_domain_span(sd), cpu_map);
+ cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+ d->sd_allnodes = 1;
+ }
+ parent = sd;
+
+ sd = &per_cpu(node_domains, i).sd;
+ SD_INIT(sd, NODE);
+ set_domain_attribute(sd, attr);
+ sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+ sd->parent = parent;
+ if (parent)
+ parent->child = sd;
+ cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
#endif
+ return sd;
+}
- p = sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), nodemask);
- sd->parent = p;
- if (p)
- p->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd;
+ sd = &per_cpu(phys_domains, i).sd;
+ SD_INIT(sd, CPU);
+ set_domain_attribute(sd, attr);
+ cpumask_copy(sched_domain_span(sd), d->nodemask);
+ sd->parent = parent;
+ if (parent)
+ parent->child = sd;
+ cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+ return sd;
+}
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_MC
- p = sd;
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map,
- cpu_coregroup_mask(i));
- sd->parent = p;
- p->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+ sd = &per_cpu(core_domains, i).sd;
+ SD_INIT(sd, MC);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
+ return sd;
+}
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_SMT
- p = sd;
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd),
- &per_cpu(cpu_sibling_map, i), cpu_map);
- sd->parent = p;
- p->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+ sd = &per_cpu(cpu_domains, i).sd;
+ SD_INIT(sd, SIBLING);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
- }
+ return sd;
+}
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+ const struct cpumask *cpu_map, int cpu)
+{
+ switch (l) {
#ifdef CONFIG_SCHED_SMT
- /* Set up CPU (sibling) groups */
- for_each_cpu(i, cpu_map) {
- cpumask_and(this_sibling_map,
- &per_cpu(cpu_sibling_map, i), cpu_map);
- if (i != cpumask_first(this_sibling_map))
- continue;
-
- init_sched_build_groups(this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- send_covered, tmpmask);
- }
+ case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+ cpumask_and(d->this_sibling_map, cpu_map,
+ topology_thread_cpumask(cpu));
+ if (cpu == cpumask_first(d->this_sibling_map))
+ init_sched_build_groups(d->this_sibling_map, cpu_map,
+ &cpu_to_cpu_group,
+ d->send_covered, d->tmpmask);
+ break;
#endif
-
#ifdef CONFIG_SCHED_MC
- /* Set up multi-core groups */
- for_each_cpu(i, cpu_map) {
- cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
- if (i != cpumask_first(this_core_map))
- continue;
-
- init_sched_build_groups(this_core_map, cpu_map,
- &cpu_to_core_group,
- send_covered, tmpmask);
- }
+ case SD_LV_MC: /* set up multi-core groups */
+ cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+ if (cpu == cpumask_first(d->this_core_map))
+ init_sched_build_groups(d->this_core_map, cpu_map,
+ &cpu_to_core_group,
+ d->send_covered, d->tmpmask);
+ break;
#endif
-
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++) {
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
- continue;
-
- init_sched_build_groups(nodemask, cpu_map,
- &cpu_to_phys_group,
- send_covered, tmpmask);
- }
-
+ case SD_LV_CPU: /* set up physical groups */
+ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+ if (!cpumask_empty(d->nodemask))
+ init_sched_build_groups(d->nodemask, cpu_map,
+ &cpu_to_phys_group,
+ d->send_covered, d->tmpmask);
+ break;
#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (sd_allnodes) {
- init_sched_build_groups(cpu_map, cpu_map,
- &cpu_to_allnodes_group,
- send_covered, tmpmask);
+ case SD_LV_ALLNODES:
+ init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+ d->send_covered, d->tmpmask);
+ break;
+#endif
+ default:
+ break;
}
+}
- for (i = 0; i < nr_node_ids; i++) {
- /* Set up node groups */
- struct sched_group *sg, *prev;
- int j;
-
- cpumask_clear(covered);
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask)) {
- sched_group_nodes[i] = NULL;
- continue;
- }
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int __build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state = sa_none;
+ struct s_data d;
+ struct sched_domain *sd;
+ int i;
+#ifdef CONFIG_NUMA
+ d.sd_allnodes = 0;
+#endif
- sched_domain_node_span(i, domainspan);
- cpumask_and(domainspan, domainspan, cpu_map);
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+ alloc_state = sa_sched_groups;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, i);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for "
- "node %d\n", i);
- goto error;
- }
- sched_group_nodes[i] = sg;
- for_each_cpu(j, nodemask) {
- struct sched_domain *sd;
+ /*
+ * Set up domains for cpus specified by the cpu_map.
+ */
+ for_each_cpu(i, cpu_map) {
+ cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+ cpu_map);
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
- sg->__cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), nodemask);
- sg->next = sg;
- cpumask_or(covered, covered, nodemask);
- prev = sg;
+ sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+ sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+ }
- for (j = 0; j < nr_node_ids; j++) {
- int n = (i + j) % nr_node_ids;
+ for_each_cpu(i, cpu_map) {
+ build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+ }
- cpumask_complement(notcovered, covered);
- cpumask_and(tmpmask, notcovered, cpu_map);
- cpumask_and(tmpmask, tmpmask, domainspan);
- if (cpumask_empty(tmpmask))
- break;
+ /* Set up physical groups */
+ for (i = 0; i < nr_node_ids; i++)
+ build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
- cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
- if (cpumask_empty(tmpmask))
- continue;
+#ifdef CONFIG_NUMA
+ /* Set up node groups */
+ if (d.sd_allnodes)
+ build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
- sg = kmalloc_node(sizeof(struct sched_group) +
- cpumask_size(),
- GFP_KERNEL, i);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- goto error;
- }
- sg->__cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), tmpmask);
- sg->next = prev->next;
- cpumask_or(covered, covered, tmpmask);
- prev->next = sg;
- prev = sg;
- }
- }
+ for (i = 0; i < nr_node_ids; i++)
+ if (build_numa_sched_groups(&d, cpu_map, i))
+ goto error;
#endif
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+ sd = &per_cpu(cpu_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+ sd = &per_cpu(core_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+ sd = &per_cpu(phys_domains, i).sd;
init_sched_groups_power(i, sd);
}
#ifdef CONFIG_NUMA
for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(sched_group_nodes[i]);
+ init_numa_sched_groups_power(d.sched_group_nodes[i]);
- if (sd_allnodes) {
+ if (d.sd_allnodes) {
struct sched_group *sg;
cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- tmpmask);
+ d.tmpmask);
init_numa_sched_groups_power(sg);
}
#endif
/* Attach the domains */
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
@@ -7776,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
- cpu_attach_domain(sd, rd, i);
+ cpu_attach_domain(sd, d.rd, i);
}
- err = 0;
-
-free_tmpmask:
- free_cpumask_var(tmpmask);
-free_send_covered:
- free_cpumask_var(send_covered);
-free_this_core_map:
- free_cpumask_var(this_core_map);
-free_this_sibling_map:
- free_cpumask_var(this_sibling_map);
-free_nodemask:
- free_cpumask_var(nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
- free_cpumask_var(notcovered);
-free_covered:
- free_cpumask_var(covered);
-free_domainspan:
- free_cpumask_var(domainspan);
-out:
-#endif
- return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
- kfree(sched_group_nodes);
-#endif
- goto free_tmpmask;
+ d.sched_group_nodes = NULL; /* don't free this we still need it */
+ __free_domain_allocs(&d, sa_tmpmask, cpu_map);
+ return 0;
-#ifdef CONFIG_NUMA
error:
- free_sched_groups(cpu_map, tmpmask);
- free_rootdomain(rd);
- goto free_tmpmask;
-#endif
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return -ENOMEM;
}
static int build_sched_domains(const struct cpumask *cpu_map)
@@ -8173,6 +9370,8 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
+const_debug unsigned int sysctl_timer_migration = 1;
+
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8204,11 +9403,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- rt_rq->highest_prio = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
#endif
#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
#endif
rt_rq->rt_time = 0;
@@ -8295,12 +9498,15 @@ void __init sched_init(void)
#ifdef CONFIG_USER_SCHED
alloc_size *= 2;
#endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ alloc_size += num_possible_cpus() * cpumask_size();
+#endif
/*
* As sched_init() is called before page_alloc is setup,
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
@@ -8332,6 +9538,12 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ ptr += cpumask_size();
+ }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
}
#ifdef CONFIG_SMP
@@ -8367,6 +9579,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8387,7 +9601,7 @@ void __init sched_init(void)
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -8404,11 +9618,11 @@ void __init sched_init(void)
* system cpu resource, based on the weight assigned to root
* user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
* by letting tasks of init_task_group sit in a separate cfs_rq
- * (init_cfs_rq) and having one entity represent this group of
+ * (init_tg_cfs_rq) and having one entity represent this group of
* tasks in rq->cfs (i.e init_task_group->se[] != NULL).
*/
init_tg_cfs_entry(&init_task_group,
- &per_cpu(init_cfs_rq, i),
+ &per_cpu(init_tg_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1,
root_task_group.se[i]);
@@ -8434,6 +9648,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8474,31 +9689,44 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
/*
* During early bootup we pretend to be a normal task:
*/
current->sched_class = &fair_sched_class;
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
+ perf_counter_init();
+
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+ return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
{
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((!in_atomic() && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -9210,6 +10438,16 @@ static int sched_rt_global_constraints(void)
return ret;
}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
@@ -9219,6 +10457,13 @@ static int sched_rt_global_constraints(void)
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
+ /*
+ * There's always some RT tasks in the root group
+ * -- migration, kstopmachine etc..
+ */
+ if (sysctl_sched_rt_runtime == 0)
+ return -EBUSY;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -9303,8 +10548,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
{
#ifdef CONFIG_RT_GROUP_SCHED
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -9415,6 +10659,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
@@ -9439,20 +10684,32 @@ static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ int i;
if (!ca)
- return ERR_PTR(-ENOMEM);
+ goto out;
ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage) {
- kfree(ca);
- return ERR_PTR(-ENOMEM);
- }
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ if (percpu_counter_init(&ca->cpustat[i], 0))
+ goto out_free_counters;
if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);
return &ca->css;
+
+out_free_counters:
+ while (--i >= 0)
+ percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
}
/* destroy an existing cpu accounting group */
@@ -9460,14 +10717,17 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+ percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
kfree(ca);
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
@@ -9486,7 +10746,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
@@ -9547,6 +10807,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
return 0;
}
+static const char *cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int i;
+
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+ s64 val = percpu_counter_read(&ca->cpustat[i]);
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[i], val);
+ }
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
@@ -9557,7 +10836,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},
-
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9575,16 +10857,42 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
struct cpuacct *ca;
int cpu;
- if (!cpuacct_subsys.active)
+ if (unlikely(!cpuacct_subsys.active))
return;
cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
ca = task_ca(tsk);
for (; ca; ca = ca->parent) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
+
+ rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val)
+{
+ struct cpuacct *ca;
+
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(tsk);
+
+ do {
+ percpu_counter_add(&ca->cpustat[idx], val);
+ ca = ca->parent;
+ } while (ca);
+ rcu_read_unlock();
}
struct cgroup_subsys cpuacct_subsys = {
@@ -9595,3 +10903,113 @@ struct cgroup_subsys cpuacct_subsys = {
.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+ int cnt = 0;
+ int cpu;
+
+ cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+ for_each_online_cpu(cpu) {
+ cnt += sprintf(&page[cnt], " %d:%d",
+ cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+ int cpu;
+ unsigned long flags;
+ bool need_full_sync = 0;
+ struct rq *rq;
+ struct migration_req *req;
+ long snap;
+ int trycount = 0;
+
+ smp_mb(); /* ensure prior mod happens before capturing snap. */
+ snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+ get_online_cpus();
+ while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+ put_online_cpus();
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+ if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+ get_online_cpus();
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ req = &per_cpu(rcu_migration_req, cpu);
+ init_completion(&req->done);
+ req->task = NULL;
+ req->dest_cpu = RCU_MIGRATION_NEED_QS;
+ spin_lock_irqsave(&rq->lock, flags);
+ list_add(&req->list, &rq->migration_queue);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ wake_up_process(rq->migration_thread);
+ }
+ for_each_online_cpu(cpu) {
+ rcu_expedited_state = cpu;
+ req = &per_cpu(rcu_migration_req, cpu);
+ rq = cpu_rq(cpu);
+ wait_for_completion(&req->done);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+ need_full_sync = 1;
+ req->dest_cpu = RCU_MIGRATION_IDLE;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+ mutex_unlock(&rcu_sched_expedited_mutex);
+ put_online_cpus();
+ if (need_full_sync)
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414c..e1d16c9a768 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,12 @@
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
* consistent between cpus (never more than 2 jiffies difference).
*/
-#include <linux/sched.h>
-#include <linux/percpu.h>
#include <linux/spinlock.h>
-#include <linux/ktime.h>
+#include <linux/hardirq.h>
#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -37,12 +38,14 @@
*/
unsigned long long __attribute__((weak)) sched_clock(void)
{
- return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+ return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+ * (NSEC_PER_SEC / HZ);
}
static __read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__read_mostly int sched_clock_stable;
struct sched_clock_data {
/*
@@ -87,7 +90,7 @@ void sched_clock_init(void)
}
/*
- * min,max except they take wrapping into account
+ * min, max except they take wrapping into account
*/
static inline u64 wrap_min(u64 x, u64 y)
@@ -111,15 +114,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
s64 delta = now - scd->tick_raw;
u64 clock, min_clock, max_clock;
- WARN_ON_ONCE(!irqs_disabled());
-
if (unlikely(delta < 0))
delta = 0;
/*
* scd->clock = clamp(scd->tick_gtod + delta,
- * max(scd->tick_gtod, scd->clock),
- * scd->tick_gtod + TICK_NSEC);
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
*/
clock = scd->tick_gtod + delta;
@@ -148,8 +149,20 @@ static void lock_double_clock(struct sched_clock_data *data1,
u64 sched_clock_cpu(int cpu)
{
- struct sched_clock_data *scd = cpu_sdc(cpu);
u64 now, clock, this_clock, remote_clock;
+ struct sched_clock_data *scd;
+
+ if (sched_clock_stable)
+ return sched_clock();
+
+ scd = cpu_sdc(cpu);
+
+ /*
+ * Normally this is not called in NMI context - but if it is,
+ * trying to do any locking here is totally lethal.
+ */
+ if (unlikely(in_nmi()))
+ return scd->clock;
if (unlikely(!sched_clock_running))
return 0ull;
@@ -195,14 +208,18 @@ u64 sched_clock_cpu(int cpu)
void sched_clock_tick(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
u64 now, now_gtod;
+ if (sched_clock_stable)
+ return;
+
if (unlikely(!sched_clock_running))
return;
WARN_ON_ONCE(!irqs_disabled());
+ scd = this_scd();
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
@@ -250,7 +267,7 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-#endif
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
unsigned long long cpu_clock(int cpu)
{
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b..0f052fc674d 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
* current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ continue;
+ }
+
return 1;
}
@@ -113,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/*
* If the cpu was currently mapped to a different value, we
- * first need to unmap the old value
+ * need to map it to the new value then remove the old value.
+ * Note, we must add the new value first, otherwise we risk the
+ * cpu being cleared from pri_active, and this cpu could be
+ * missed for a push or pull.
*/
- if (likely(oldpri != CPUPRI_INVALID)) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
-
- spin_lock_irqsave(&vec->lock, flags);
-
- vec->count--;
- if (!vec->count)
- clear_bit(oldpri, cp->pri_active);
- cpumask_clear_cpu(cpu, vec->mask);
-
- spin_unlock_irqrestore(&vec->lock, flags);
- }
-
if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
@@ -140,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
spin_unlock_irqrestore(&vec->lock, flags);
}
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ spin_lock_irqsave(&vec->lock, flags);
+
+ vec->count--;
+ if (!vec->count)
+ clear_bit(oldpri, cp->pri_active);
+ cpumask_clear_cpu(cpu, vec->mask);
+
+ spin_unlock_irqrestore(&vec->lock, flags);
+ }
*currpri = newpri;
}
@@ -151,10 +167,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*
* Returns: -ENOMEM if memory fails.
*/
-int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
+int cpupri_init(struct cpupri *cp, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
int i;
+ if (bootmem)
+ gfp = GFP_NOWAIT;
+
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -162,9 +182,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
spin_lock_init(&vec->lock);
vec->count = 0;
- if (bootmem)
- alloc_bootmem_cpumask_var(&vec->mask);
- else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&vec->mask, gfp))
goto cleanup;
}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0..9a7e859b8fb 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp,
- struct task_struct *p, cpumask_t *lowest_mask);
+ struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp, bool bootmem);
void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e416..5ddbd089126 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
spread, rq0_min_vruntime, spread0;
- struct rq *rq = &per_cpu(runqueues, cpu);
+ struct rq *rq = cpu_rq(cpu);
struct sched_entity *last;
unsigned long flags;
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
if (last)
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+ rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
- struct rq *rq = &per_cpu(runqueues, cpu);
+ struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_X86
{
@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
- SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
PN(next_balance);
P(curr->pid);
PN(clock);
@@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
- P(yld_exp_empty);
- P(yld_act_empty);
- P(yld_both_empty);
P(yld_count);
P(sched_switch);
@@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
@@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.vruntime);
PN(se.sum_exec_runtime);
PN(se.avg_overlap);
+ PN(se.avg_wakeup);
nr_switches = p->nvcsw + p->nivcsw;
@@ -411,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.wait_max);
PN(se.wait_sum);
P(se.wait_count);
+ PN(se.iowait_sum);
+ P(se.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
@@ -481,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
p->se.wait_max = 0;
p->se.wait_sum = 0;
p->se.wait_count = 0;
+ p->se.iowait_sum = 0;
+ p->se.iowait_count = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044..aa7f8412101 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 20000000ULL;
+unsigned int sysctl_sched_latency = 5000000ULL;
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 4000000ULL;
+unsigned int sysctl_sched_min_granularity = 1000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
static unsigned int sched_nr_latency = 5;
/*
- * After fork, child runs first. (default) If set to 0 then
+ * After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first.
*/
-const_debug unsigned int sysctl_sched_child_runs_first = 1;
+unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
* CFS operations on generic schedulable entities:
*/
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- return container_of(se, struct task_struct, se);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!entity_is_task(se));
+#endif
+ return container_of(se, struct task_struct, se);
+}
+
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
@@ -266,6 +274,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
+static inline int entity_before(struct sched_entity *a,
+ struct sched_entity *b)
+{
+ return (s64)(a->vruntime - b->vruntime) < 0;
+}
+
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return se->vruntime - cfs_rq->min_vruntime;
@@ -430,12 +444,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
for_each_sched_entity(se) {
struct load_weight *load;
+ struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
- struct load_weight lw = cfs_rq->load;
+ lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
@@ -530,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
schedstat_set(se->wait_count, se->wait_count + 1);
schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start);
+#ifdef CONFIG_SCHEDSTATS
+ if (entity_is_task(se)) {
+ trace_sched_stat_wait(task_of(se),
+ rq_of(cfs_rq)->clock - se->wait_start);
+ }
+#endif
schedstat_set(se->wait_start, 0);
}
@@ -604,9 +625,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
+ struct task_struct *tsk = NULL;
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -617,11 +642,13 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
- account_scheduler_latency(tsk, delta >> 10, 1);
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
- struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -632,17 +659,25 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->block_start = 0;
se->sum_sleep_runtime += delta;
- /*
- * Blocking time is in units of nanosecs, so shift by 20 to
- * get a milliseconds-range estimation of the amount of
- * time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
+ if (tsk) {
+ if (tsk->in_iowait) {
+ se->iowait_sum += delta;
+ se->iowait_count++;
+ trace_sched_stat_iowait(tsk, delta);
+ }
- profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
- delta >> 20);
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
}
- account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
@@ -686,16 +721,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* all of which have the same weight.
*/
if (sched_feat(NORMALIZED_SLEEPER) &&
- task_of(se)->policy != SCHED_IDLE)
+ (!entity_is_task(se) ||
+ task_of(se)->policy != SCHED_IDLE))
thresh = calc_delta_fair(thresh, se);
vruntime -= thresh;
}
-
- /* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
+ vruntime = max_vruntime(se->vruntime, vruntime);
+
se->vruntime = vruntime;
}
@@ -719,7 +755,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
cfs_rq->last = NULL;
@@ -728,6 +764,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->next = NULL;
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ __clear_buddies(cfs_rq_of(se), se);
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -768,8 +810,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ }
}
static void
@@ -1003,7 +1051,7 @@ static void yield_task_fair(struct rq *rq)
/*
* Already in the rightmost position?
*/
- if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
+ if (unlikely(!rightmost || entity_before(rightmost, se)))
return;
/*
@@ -1020,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_mask)
+ * hence we need to mask them out (rq->rd->online)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+
+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
+
static int wake_idle(int cpu, struct task_struct *p)
{
struct sched_domain *sd;
int i;
unsigned int chosen_wakeup_cpu;
int this_cpu;
+ struct rq *task_rq = task_rq(p);
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1063,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
- && !task_hot(p, task_rq(p)->clock, sd))) {
+ && !task_hot(p, task_rq->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) {
- if (cpu_active(i) && idle_cpu(i)) {
+ if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
@@ -1209,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(p);
weight = p->se.load.weight;
- balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+ /*
+ * In low-load situations, where prev_cpu is idle and this_cpu is idle
+ * due to the sync cause above having dropped tl to 0, we'll always have
+ * an imbalance, but there's really nothing you can do about that, so
+ * that's good too.
+ *
+ * Otherwise check if either cpus are near enough in load to allow this
+ * task to be woken on this_cpu.
+ */
+ balanced = !tl ||
+ 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
@@ -1252,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
- if (prev_cpu == this_cpu)
- goto out;
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
@@ -1302,16 +1362,63 @@ out:
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
+/*
+ * Adaptive granularity
+ *
+ * se->avg_wakeup gives the average time a task runs until it does a wakeup,
+ * with the limit of wakeup_gran -- when it never does a wakeup.
+ *
+ * So the smaller avg_wakeup is the faster we want this task to preempt,
+ * but we don't want to treat the preemptee unfairly and therefore allow it
+ * to run for at least the amount of time we'd like to run.
+ *
+ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
+ *
+ * NOTE: we use *nr_running to scale with load, this nicely matches the
+ * degrading latency on load.
+ */
+static unsigned long
+adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+ u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
+ u64 gran = 0;
+
+ if (this_run < expected_wakeup)
+ gran = expected_wakeup - this_run;
+
+ return min_t(s64, gran, sysctl_sched_wakeup_granularity);
+}
+
+static unsigned long
+wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
+ if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
+ gran = adaptive_gran(curr, se);
+
/*
- * More easily preempt - nice tasks, while not making it harder for
- * + nice tasks.
+ * Since its curr running now, convert the gran from real-time
+ * to virtual-time in his units.
*/
- if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+ if (sched_feat(ASYM_GRAN)) {
+ /*
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
+ } else {
+ if (unlikely(curr->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, curr);
+ }
return gran;
}
@@ -1338,7 +1445,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
- gran = wakeup_gran(curr);
+ gran = wakeup_gran(curr, se);
if (vdiff > gran)
return 1;
@@ -1428,17 +1535,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
find_matching_se(&se, &pse);
- while (se) {
- BUG_ON(!pse);
-
- if (wakeup_preempt_entity(se, pse) == 1) {
- resched_task(curr);
- break;
- }
+ BUG_ON(!pse);
- se = parent_entity(se);
- pse = parent_entity(pse);
- }
+ if (wakeup_preempt_entity(se, pse) == 1)
+ resched_task(curr);
}
static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1452,6 +1552,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ /*
+ * If se was a buddy, clear it so that it will have to earn
+ * the favour again.
+ */
+ __clear_buddies(cfs_rq, se);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
@@ -1650,11 +1755,13 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
sched_info_queued(p);
update_curr(cfs_rq);
+ if (curr)
+ se->vruntime = curr->vruntime;
place_entity(cfs_rq, se, 1);
/* 'curr' will be NULL if the child belongs to a different group */
if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
- curr && curr->vruntime < se->vruntime) {
+ curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c..e2dc63a5815 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
-SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
+SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
+SCHED_FEAT(NORMALIZED_SLEEPER, 0)
+SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c1..499672c10cb 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
-
+ /* adjust the active tasks as we might go into a long sleep */
+ calc_load_account_active(rq);
return rq->idle;
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b79..2eb4bd6a526 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,52 @@
* policies)
*/
+#ifdef CONFIG_RT_GROUP_SCHED
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+#define rt_entity_is_task(rt_se) (1)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->rt;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
#ifdef CONFIG_SMP
static inline int rt_overloaded(struct rq *rq)
@@ -37,25 +83,86 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rq *rq)
+static void update_rt_migration(struct rt_rq *rt_rq)
{
- if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- if (!rq->rt.overloaded) {
- rt_set_overload(rq);
- rq->rt.overloaded = 1;
+ if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+ if (!rt_rq->overloaded) {
+ rt_set_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 1;
}
- } else if (rq->rt.overloaded) {
- rt_clear_overload(rq);
- rq->rt.overloaded = 0;
+ } else if (rt_rq->overloaded) {
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 0;
}
}
-#endif /* CONFIG_SMP */
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total++;
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory++;
+
+ update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total--;
+ if (rt_se->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory--;
+
+ update_rt_migration(rt_rq);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- return container_of(rt_se, struct task_struct, rt);
}
+#endif /* CONFIG_SMP */
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -79,16 +186,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- return rt_se->rt_rq;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -108,7 +205,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
- if (rt_rq->highest_prio < curr->prio)
+ if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
}
@@ -176,19 +273,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
- return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
-
- return &rq->rt;
-}
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@@ -473,7 +557,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
- return rt_rq->highest_prio;
+ return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
@@ -531,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
+ sched_rt_avg_update(rq, delta_exec);
+
if (!rt_bandwidth_enabled())
return;
@@ -547,91 +633,174 @@ static void update_curr_rt(struct rq *rq)
}
}
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+#if defined CONFIG_SMP
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
{
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
-#ifdef CONFIG_SMP
- struct rq *rq = rq_of_rt_rq(rt_rq);
-#endif
+ struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+ if (next && rt_prio(next->prio))
+ return next->prio;
+ else
+ return MAX_RT_PRIO;
+}
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (prio < prev_prio) {
+
+ /*
+ * If the new task is higher in priority than anything on the
+ * run-queue, we know that the previous high becomes our
+ * next-highest.
+ */
+ rt_rq->highest_prio.next = prev_prio;
- rt_rq->highest_prio = rt_se_prio(rt_se);
-#ifdef CONFIG_SMP
if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_se_prio(rt_se));
-#endif
- }
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
- rq->rt.rt_nr_migratory++;
- }
+ } else if (prio == rt_rq->highest_prio.curr)
+ /*
+ * If the next task is equal in priority to the highest on
+ * the run-queue, then we implicitly know that the next highest
+ * task cannot be any lower than current
+ */
+ rt_rq->highest_prio.next = prio;
+ else if (prio < rt_rq->highest_prio.next)
+ /*
+ * Otherwise, we need to recompute next-highest
+ */
+ rt_rq->highest_prio.next = next_prio(rq);
+}
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- if (rt_se_boosted(rt_se))
- rt_rq->rt_nr_boosted++;
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-#else
- start_rt_bandwidth(&def_rt_bandwidth);
-#endif
+ if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ rt_rq->highest_prio.next = next_prio(rq);
+
+ if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
+#else /* CONFIG_SMP */
+
static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_SMP
- int highest_prio = rt_rq->highest_prio;
-#endif
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (prio < prev_prio)
+ rt_rq->highest_prio.curr = prio;
+
+ inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
if (rt_rq->rt_nr_running) {
- struct rt_prio_array *array;
- WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- /* recalculate */
- array = &rt_rq->active;
- rt_rq->highest_prio =
+ WARN_ON(prio < prev_prio);
+
+ /*
+ * This may have been our highest task, and therefore
+ * we may have some recomputation to do
+ */
+ if (prio == prev_prio) {
+ struct rt_prio_array *array = &rt_rq->active;
+
+ rt_rq->highest_prio.curr =
sched_find_first_bit(array->bitmap);
- } /* otherwise leave rq->highest prio alone */
+ }
+
} else
- rt_rq->highest_prio = MAX_RT_PRIO;
-#endif
-#ifdef CONFIG_SMP
- if (rt_se->nr_cpus_allowed > 1) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
- rq->rt.rt_nr_migratory--;
- }
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
- if (rt_rq->highest_prio != highest_prio) {
- struct rq *rq = rq_of_rt_rq(rt_rq);
+ dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
- if (rq->online)
- cpupri_set(&rq->rd->cpupri, rq->cpu,
- rt_rq->highest_prio);
- }
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
- update_rt_migration(rq_of_rt_rq(rt_rq));
-#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-#endif
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ int prio = rt_se_prio(rt_se);
+
+ WARN_ON(!rt_prio(prio));
+ rt_rq->rt_nr_running++;
+
+ inc_rt_prio(rt_rq, prio);
+ inc_rt_migration(rt_se, rt_rq);
+ inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running--;
+
+ dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ dec_rt_migration(rt_se, rt_rq);
+ dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -718,7 +887,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
enqueue_rt_entity(rt_se);
- inc_cpu_load(rq, p->se.load.weight);
+ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -728,7 +898,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
- dec_cpu_load(rq, p->se.load.weight);
+ dequeue_pushable_task(rq, p);
}
/*
@@ -805,20 +975,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- cpumask_var_t mask;
-
if (rq->curr->rt.nr_cpus_allowed == 1)
return;
- if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
- return;
-
if (p->rt.nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, mask))
- goto free;
+ && cpupri_find(&rq->rd->cpupri, p, NULL))
+ return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
- goto free;
+ if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ return;
/*
* There appears to be other cpus that can accept
@@ -827,8 +992,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
requeue_task_rt(rq, p, 1);
resched_task(rq->curr);
-free:
- free_cpumask_var(mask);
}
#endif /* CONFIG_SMP */
@@ -878,7 +1041,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
return next;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
@@ -900,6 +1063,26 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = _pick_next_task_rt(rq);
+
+ /* The running task is never eligible for pushing */
+ if (p)
+ dequeue_pushable_task(rq, p);
+
+#ifdef CONFIG_SMP
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+#endif
+
return p;
}
@@ -907,6 +1090,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
p->se.exec_start = 0;
+
+ /*
+ * The previous task needs to be made eligible for pushing
+ * if it is still active
+ */
+ if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
@@ -960,16 +1150,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+ const struct cpumask *mask)
{
int first;
/* "this_cpu" is cheaper to preempt than a remote processor */
- if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+ if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
return this_cpu;
- first = first_cpu(*mask);
- if (first != NR_CPUS)
+ first = cpumask_first(mask);
+ if (first < nr_cpu_ids)
return first;
return -1;
@@ -981,6 +1172,7 @@ static int find_lowest_rq(struct task_struct *task)
struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ cpumask_var_t domain_mask;
if (task->rt.nr_cpus_allowed == 1)
return -1; /* No other targets possible */
@@ -989,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No targets found */
/*
- * Only consider CPUs that are usable for migration.
- * I guess we might want to change cpupri_find() to ignore those
- * in the first place.
- */
- cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
-
- /*
* At this point we have built a mask of cpus representing the
* lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology.
@@ -1013,19 +1198,25 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu == cpu)
this_cpu = -1; /* Skip this_cpu opt if the same */
- for_each_domain(cpu, sd) {
- if (sd->flags & SD_WAKE_AFFINE) {
- cpumask_t domain_mask;
- int best_cpu;
+ if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
- cpumask_and(&domain_mask, sched_domain_span(sd),
- lowest_mask);
+ cpumask_and(domain_mask,
+ sched_domain_span(sd),
+ lowest_mask);
- best_cpu = pick_optimal_cpu(this_cpu,
- &domain_mask);
- if (best_cpu != -1)
- return best_cpu;
+ best_cpu = pick_optimal_cpu(this_cpu,
+ domain_mask);
+
+ if (best_cpu != -1) {
+ free_cpumask_var(domain_mask);
+ return best_cpu;
+ }
+ }
}
+ free_cpumask_var(domain_mask);
}
/*
@@ -1072,7 +1263,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
}
/* If this rq is still suitable use it. */
- if (lowest_rq->rt.highest_prio > task->prio)
+ if (lowest_rq->rt.highest_prio.curr > task->prio)
break;
/* try again */
@@ -1083,6 +1274,26 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+ BUG_ON(!p->se.on_rq);
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
@@ -1092,13 +1303,11 @@ static int push_rt_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *lowest_rq;
- int ret = 0;
- int paranoid = RT_MAX_TRIES;
if (!rq->rt.overloaded)
return 0;
- next_task = pick_next_highest_task_rt(rq, -1);
+ next_task = pick_next_pushable_task(rq);
if (!next_task)
return 0;
@@ -1127,16 +1336,34 @@ static int push_rt_task(struct rq *rq)
struct task_struct *task;
/*
* find lock_lowest_rq releases rq->lock
- * so it is possible that next_task has changed.
- * If it has, then try again.
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
*/
- task = pick_next_highest_task_rt(rq, -1);
- if (unlikely(task != next_task) && task && paranoid--) {
- put_task_struct(next_task);
- next_task = task;
- goto retry;
+ task = pick_next_pushable_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * If we get here, the task hasnt moved at all, but
+ * it has failed to push. We will not try again,
+ * since the other cpus will pull from us when they
+ * are ready.
+ */
+ dequeue_pushable_task(rq, next_task);
+ goto out;
}
- goto out;
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
}
deactivate_task(rq, next_task, 0);
@@ -1147,23 +1374,12 @@ static int push_rt_task(struct rq *rq)
double_unlock_balance(rq, lowest_rq);
- ret = 1;
out:
put_task_struct(next_task);
- return ret;
+ return 1;
}
-/*
- * TODO: Currently we just use the second highest prio task on
- * the queue, and stop when it can't migrate (or there's
- * no more RT tasks). There may be a case where a lower
- * priority RT task has a different affinity than the
- * higher RT task. In this case the lower RT task could
- * possibly be able to migrate where as the higher priority
- * RT task could not. We currently ignore this issue.
- * Enhancements are welcome!
- */
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
@@ -1174,33 +1390,35 @@ static void push_rt_tasks(struct rq *rq)
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p, *next;
+ struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
return 0;
- next = pick_next_task_rt(this_rq);
-
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if its going logically lower, we do not care
+ */
+ if (src_rq->rt.highest_prio.next >=
+ this_rq->rt.highest_prio.curr)
+ continue;
+
/*
* We can potentially drop this_rq's lock in
* double_lock_balance, and another CPU could
- * steal our next task - hence we must cause
- * the caller to recalculate the next task
- * in that case:
+ * alter this_rq
*/
- if (double_lock_balance(this_rq, src_rq)) {
- struct task_struct *old_next = next;
-
- next = pick_next_task_rt(this_rq);
- if (next != old_next)
- ret = 1;
- }
+ double_lock_balance(this_rq, src_rq);
/*
* Are there still pullable RT tasks?
@@ -1214,7 +1432,7 @@ static int pull_rt_task(struct rq *this_rq)
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
- if (p && (!next || (p->prio < next->prio))) {
+ if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq);
@@ -1224,12 +1442,9 @@ static int pull_rt_task(struct rq *this_rq)
* This is just that p is wakeing up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
- * current task on the run queue or
- * this_rq next task is lower in prio than
- * the current task on that rq.
+ * current task on the run queue
*/
- if (p->prio < src_rq->curr->prio ||
- (next && next->prio < src_rq->curr->prio))
+ if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
@@ -1242,13 +1457,7 @@ static int pull_rt_task(struct rq *this_rq)
* case there's an even higher prio task
* in another runqueue. (low likelyhood
* but possible)
- *
- * Update next so that we won't pick a task
- * on another cpu with a priority lower (or equal)
- * than the one we just picked.
*/
- next = p;
-
}
skip:
double_unlock_balance(this_rq, src_rq);
@@ -1260,24 +1469,13 @@ static int pull_rt_task(struct rq *this_rq)
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
static void post_schedule_rt(struct rq *rq)
{
- /*
- * If we have more than one rt_task queued, then
- * see if we can push the other rt_tasks off to other CPUS.
- * Note we may release the rq lock, and since
- * the lock was owned by prev, we need to release it
- * first via finish_lock_switch and then reaquire it here.
- */
- if (unlikely(rq->rt.overloaded)) {
- spin_lock_irq(&rq->lock);
- push_rt_tasks(rq);
- spin_unlock_irq(&rq->lock);
- }
+ push_rt_tasks(rq);
}
/*
@@ -1288,7 +1486,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- rq->rt.overloaded)
+ has_pushable_tasks(rq) &&
+ p->rt.nr_cpus_allowed > 1)
push_rt_tasks(rq);
}
@@ -1324,6 +1523,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
+ if (!task_current(rq, p)) {
+ /*
+ * Make sure we dequeue this task from the pushable list
+ * before going further. It will either remain off of
+ * the list because we are no longer pushable, or it
+ * will be requeued.
+ */
+ if (p->rt.nr_cpus_allowed > 1)
+ dequeue_pushable_task(rq, p);
+
+ /*
+ * Requeue if our weight is changing and still > 1
+ */
+ if (weight > 1)
+ enqueue_pushable_task(rq, p);
+
+ }
+
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1331,7 +1548,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
rq->rt.rt_nr_migratory--;
}
- update_rt_migration(rq);
+ update_rt_migration(&rq->rt);
}
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -1346,7 +1563,7 @@ static void rq_online_rt(struct rq *rq)
__enable_runtime(rq);
- cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
/* Assumes rq->lock is held */
@@ -1383,7 +1600,7 @@ static inline void init_sched_rt_class(void)
unsigned int i;
for_each_possible_cpu(i)
- alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_SMP */
@@ -1438,7 +1655,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
* can release the rq lock and p could migrate.
* Only reschedule if p is still on the same runqueue.
*/
- if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
@@ -1509,6 +1726,9 @@ static void set_curr_task_rt(struct rq *rq)
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d122..32d2bd4061b 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -4,7 +4,7 @@
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 14
+#define SCHEDSTAT_VERSION 15
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
- cpu, rq->yld_both_empty,
- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
+ "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
rq->sched_switch, rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_cpu_time,
@@ -296,20 +295,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
static inline void account_group_user_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct signal_struct *sig;
+ struct thread_group_cputimer *cputimer;
/* tsk == current, ensure it is safe to use ->signal */
if (unlikely(tsk->exit_state))
return;
- sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ cputimer = &tsk->signal->cputimer;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->utime = cputime_add(times->utime, cputime);
- put_cpu_no_resched();
- }
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.utime =
+ cputime_add(cputimer->cputime.utime, cputime);
+ spin_unlock(&cputimer->lock);
}
/**
@@ -325,20 +325,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
static inline void account_group_system_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct signal_struct *sig;
+ struct thread_group_cputimer *cputimer;
/* tsk == current, ensure it is safe to use ->signal */
if (unlikely(tsk->exit_state))
return;
- sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ cputimer = &tsk->signal->cputimer;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->stime = cputime_add(times->stime, cputime);
- put_cpu_no_resched();
- }
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.stime =
+ cputime_add(cputimer->cputime.stime, cputime);
+ spin_unlock(&cputimer->lock);
}
/**
@@ -354,6 +355,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
+ struct thread_group_cputimer *cputimer;
struct signal_struct *sig;
sig = tsk->signal;
@@ -362,11 +364,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (unlikely(!sig))
return;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ cputimer = &sig->cputimer;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->sum_exec_runtime += ns;
- put_cpu_no_resched();
- }
+ if (!cputimer->running)
+ return;
+
+ spin_lock(&cputimer->lock);
+ cputimer->cputime.sum_exec_runtime += ns;
+ spin_unlock(&cputimer->lock);
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ad64fcb731f..57d4b13b631 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,6 +8,7 @@
#include <linux/seccomp.h>
#include <linux/sched.h>
+#include <linux/compat.h>
/* #define SECCOMP_DEBUG 1 */
#define NR_SECCOMP_MODES 1
@@ -22,7 +23,7 @@ static int mode1_syscalls[] = {
0, /* null terminated */
};
-#ifdef TIF_32BIT
+#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
0, /* null terminated */
@@ -37,8 +38,8 @@ void __secure_computing(int this_syscall)
switch (mode) {
case 1:
syscall = mode1_syscalls;
-#ifdef TIF_32BIT
- if (test_thread_flag(TIF_32BIT))
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
syscall = mode1_syscalls_32;
#endif
do {
diff --git a/kernel/signal.c b/kernel/signal.c
index e73759783dc..64c5deeaca5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include <asm/param.h>
#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
static struct kmem_cache *sigqueue_cachep;
-DEFINE_TRACE(sched_signal_send);
-
static void __user *sig_handler(struct task_struct *t, int sig)
{
return t->sighand->action[sig - 1].sa.sa_handler;
@@ -55,10 +53,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
-static int sig_ignored(struct task_struct *t, int sig)
+static int sig_task_ignored(struct task_struct *t, int sig,
+ int from_ancestor_ns)
{
void __user *handler;
+ handler = sig_handler(t, sig);
+
+ if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
+ handler == SIG_DFL && !from_ancestor_ns)
+ return 1;
+
+ return sig_handler_ignored(handler, sig);
+}
+
+static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
+{
/*
* Blocked signals are never ignored, since the
* signal handler may change by the time it is
@@ -67,14 +77,13 @@ static int sig_ignored(struct task_struct *t, int sig)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- handler = sig_handler(t, sig);
- if (!sig_handler_ignored(handler, sig))
+ if (!sig_task_ignored(t, sig, from_ancestor_ns))
return 0;
/*
* Tracers may want to know about even ignored signals.
*/
- return !tracehook_consider_ignored_signal(t, sig, handler);
+ return !tracehook_consider_ignored_signal(t, sig);
}
/*
@@ -238,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
/*
* Flush all pending signals for a task.
*/
+void __flush_signals(struct task_struct *t)
+{
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
+}
+
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
- clear_tsk_thread_flag(t, TIF_SIGPENDING);
- flush_sigqueue(&t->pending);
- flush_sigqueue(&t->signal->shared_pending);
+ __flush_signals(t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
@@ -318,7 +332,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
- return !tracehook_consider_fatal_signal(tsk, sig, handler);
+ return !tracehook_consider_fatal_signal(tsk, sig);
}
@@ -624,7 +638,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
-static int prepare_signal(int sig, struct task_struct *p)
+static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
@@ -708,7 +722,7 @@ static int prepare_signal(int sig, struct task_struct *p)
}
}
- return !sig_ignored(p, sig);
+ return !sig_ignored(p, sig, from_ancestor_ns);
}
/*
@@ -777,7 +791,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL ||
- !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
+ !tracehook_consider_fatal_signal(t, sig))) {
/*
* This signal will be fatal to the whole group.
*/
@@ -813,16 +827,18 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
- int group)
+static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
+ int group, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
+ int override_rlimit;
trace_sched_signal_send(sig, t);
assert_spin_locked(&t->sighand->siglock);
- if (!prepare_signal(sig, t))
+
+ if (!prepare_signal(sig, t, from_ancestor_ns))
return 0;
pending = group ? &t->signal->shared_pending : &t->pending;
@@ -848,9 +864,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
make sure at least one signal gets delivered and don't
pass on the info struct. */
- q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
- (is_si_special(info) ||
- info->si_code >= 0)));
+ if (sig < SIGRTMIN)
+ override_rlimit = (is_si_special(info) || info->si_code >= 0);
+ else
+ override_rlimit = 0;
+
+ q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+ override_rlimit);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
@@ -871,6 +891,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
break;
default:
copy_siginfo(&q->info, info);
+ if (from_ancestor_ns)
+ q->info.si_pid = 0;
break;
}
} else if (!is_si_special(info)) {
@@ -889,6 +911,20 @@ out_set:
return 0;
}
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+ int group)
+{
+ int from_ancestor_ns = 0;
+
+#ifdef CONFIG_PID_NS
+ if (!is_si_special(info) && SI_FROMUSER(info) &&
+ task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
+ from_ancestor_ns = 1;
+#endif
+
+ return __send_signal(sig, info, t, group, from_ancestor_ns);
+}
+
int print_fatal_signals;
static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -909,7 +945,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
}
#endif
printk("\n");
+ preempt_disable();
show_regs(regs);
+ preempt_enable();
}
static int __init setup_print_fatal_signals(char *str)
@@ -1131,7 +1169,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
if (sig && p->sighand) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
- ret = __group_send_sig_info(sig, info, p);
+ ret = __send_signal(sig, info, p, 1, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
out_unlock:
@@ -1318,7 +1356,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
goto ret;
ret = 1; /* the signal is ignored */
- if (!prepare_signal(sig, t))
+ if (!prepare_signal(sig, t, 0))
goto out;
ret = 0;
@@ -1365,7 +1403,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
- struct task_cputime cputime;
int ret = sig;
BUG_ON(sig == -1);
@@ -1373,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
- BUG_ON(!tsk->ptrace &&
+ BUG_ON(!task_ptrace(tsk) &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
@@ -1395,9 +1432,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
- thread_group_cputime(tsk, &cputime);
- info.si_utime = cputime_to_jiffies(cputime.utime);
- info.si_stime = cputime_to_jiffies(cputime.stime);
+ info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+ tsk->signal->utime));
+ info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+ tsk->signal->stime));
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1411,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
- if (!tsk->ptrace && sig == SIGCHLD &&
+ if (!task_ptrace(tsk) && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
@@ -1448,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
struct task_struct *parent;
struct sighand_struct *sighand;
- if (tsk->ptrace & PT_PTRACED)
+ if (task_ptrace(tsk))
parent = tsk->parent;
else {
tsk = tsk->group_leader;
@@ -1461,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
* see comment in do_notify_parent() abot the following 3 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();
@@ -1497,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
static inline int may_ptrace_stop(void)
{
- if (!likely(current->ptrace & PT_PTRACED))
+ if (!likely(task_ptrace(current)))
return 0;
/*
* Are we in the middle of do_coredump?
@@ -1573,7 +1611,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
+ /*
+ * Don't want to allow preemption here, because
+ * sys_ptrace() needs this task to be inactive.
+ *
+ * XXX: implement read_unlock_no_resched().
+ */
+ preempt_disable();
read_unlock(&tasklist_lock);
+ preempt_enable_no_resched();
schedule();
} else {
/*
@@ -1707,7 +1753,7 @@ static int do_signal_stop(int signr)
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
- if (!(current->ptrace & PT_PTRACED))
+ if (!task_ptrace(current))
return signr;
ptrace_signal_deliver(regs, cookie);
@@ -1834,9 +1880,16 @@ relock:
/*
* Global init gets no signals it doesn't want.
+ * Container-init gets no signals it doesn't want from same
+ * container.
+ *
+ * Note that if global/container-init sees a sig_kernel_only()
+ * signal here, the signal must have been generated internally
+ * or must have come from an ancestor namespace. In either
+ * case, the signal cannot be dropped.
*/
if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
- !signal_group_exit(signal))
+ !sig_kernel_only(signr))
continue;
if (sig_kernel_stop(signr)) {
@@ -2233,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
return kill_something_info(sig, &info, pid);
}
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
{
- int error;
- struct siginfo info;
struct task_struct *p;
unsigned long flags;
-
- error = -ESRCH;
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_TKILL;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = current_uid();
+ int error = -ESRCH;
rcu_read_lock();
p = find_task_by_vpid(pid);
if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
- error = check_kill_permission(sig, &info, p);
+ error = check_kill_permission(sig, info, p);
/*
* The null signal is a permissions and process existence
* probe. No signal is actually delivered.
@@ -2260,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
* signal is private anyway.
*/
if (!error && sig && lock_task_sighand(p, &flags)) {
- error = specific_send_sig_info(sig, &info, p);
+ error = specific_send_sig_info(sig, info, p);
unlock_task_sighand(p, &flags);
}
}
@@ -2269,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
return error;
}
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+ struct siginfo info;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_TKILL;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = current_uid();
+
+ return do_send_specific(tgid, pid, sig, &info);
+}
+
/**
* sys_tgkill - send signal to one specific thread
* @tgid: the thread group ID of the thread
@@ -2318,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return kill_proc_info(sig, &info, pid);
}
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0 || tgid <= 0)
+ return -EINVAL;
+
+ /* Not even root can pretend to send signals from the kernel.
+ Nor can they impersonate a kill(), which adds source info. */
+ if (info->si_code >= 0)
+ return -EPERM;
+ info->si_signo = sig;
+
+ return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *t = current;
@@ -2369,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
stack_t oss;
int error;
- if (uoss) {
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp);
- }
+ oss.ss_sp = (void __user *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
if (uss) {
void __user *ss_sp;
@@ -2381,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
int ss_flags;
error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
- || __get_user(ss_sp, &uss->ss_sp)
- || __get_user(ss_flags, &uss->ss_flags)
- || __get_user(ss_size, &uss->ss_size))
+ if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
+ goto out;
+ error = __get_user(ss_sp, &uss->ss_sp) |
+ __get_user(ss_flags, &uss->ss_flags) |
+ __get_user(ss_size, &uss->ss_size);
+ if (error)
goto out;
error = -EPERM;
@@ -2416,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
current->sas_ss_size = ss_size;
}
+ error = 0;
if (uoss) {
error = -EFAULT;
- if (copy_to_user(uoss, &oss, sizeof(oss)))
+ if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
goto out;
+ error = __put_user(oss.ss_sp, &uoss->ss_sp) |
+ __put_user(oss.ss_size, &uoss->ss_size) |
+ __put_user(oss.ss_flags, &uoss->ss_flags);
}
- error = 0;
out:
return error;
}
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 00000000000..09d7519557d
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,645 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
+ * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
+ * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+ void __user *, size_t *, loff_t *);
+#endif
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+ * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "min-threads",
+ .data = &slow_work_min_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_min_threads_sysctl,
+ .extra1 = (void *) &slow_work_min_min_threads,
+ .extra2 = &slow_work_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max-threads",
+ .data = &slow_work_max_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_max_threads_sysctl,
+ .extra1 = &slow_work_min_threads,
+ .extra2 = (void *) &slow_work_max_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "vslow-percentage",
+ .data = &vslow_work_proportion,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = (void *) &slow_work_min_vslow,
+ .extra2 = (void *) &slow_work_max_vslow,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
+/*
+ * The queues of work items and the lock governing access to them. These are
+ * shared between all the CPUs. It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls. A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock. Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool. This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+ unsigned vsmax;
+
+ vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+ vsmax /= 100;
+ vsmax = max(vsmax, 1U);
+ return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread. Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+ struct slow_work *work = NULL;
+ unsigned vsmax;
+ bool very_slow;
+
+ vsmax = slow_work_calc_vsmax();
+
+ /* see if we can schedule a new thread to be started if we're not
+ * keeping up with the work */
+ if (!waitqueue_active(&slow_work_thread_wq) &&
+ (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+ atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+ !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+
+ /* find something to execute */
+ spin_lock_irq(&slow_work_queue_lock);
+ if (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax) {
+ work = list_entry(vslow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ atomic_inc(&vslow_work_executing_count);
+ very_slow = true;
+ } else if (!list_empty(&slow_work_queue)) {
+ work = list_entry(slow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ very_slow = false;
+ } else {
+ very_slow = false; /* avoid the compiler warning */
+ }
+ spin_unlock_irq(&slow_work_queue_lock);
+
+ if (!work)
+ return false;
+
+ if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+ BUG();
+
+ work->ops->execute(work);
+
+ if (very_slow)
+ atomic_dec(&vslow_work_executing_count);
+ clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+ /* if someone tried to enqueue the item whilst we were executing it,
+ * then it'll be left unenqueued to avoid multiple threads trying to
+ * execute it simultaneously
+ *
+ * there is, however, a race between us testing the pending flag and
+ * getting the spinlock, and between the enqueuer setting the pending
+ * flag and getting the spinlock, so we use a deferral bit to tell us
+ * if the enqueuer got there first
+ */
+ if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irq(&slow_work_queue_lock);
+
+ if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+ test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+ goto auto_requeue;
+
+ spin_unlock_irq(&slow_work_queue_lock);
+ }
+
+ work->ops->put_ref(work);
+ return true;
+
+auto_requeue:
+ /* we must complete the enqueue operation
+ * - we transfer our ref on the item back to the appropriate queue
+ * - don't wake another thread up as we're awake already
+ */
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ spin_unlock_irq(&slow_work_queue_lock);
+ return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing. If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations. The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention. The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute. This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+ unsigned long flags;
+
+ BUG_ON(slow_work_user_count <= 0);
+ BUG_ON(!work);
+ BUG_ON(!work->ops);
+ BUG_ON(!work->ops->get_ref);
+
+ /* when honouring an enqueue request, we only promise that we will run
+ * the work function in the future; we do not promise to run it once
+ * per enqueue request
+ *
+ * we use the PENDING bit to merge together repeat requests without
+ * having to disable IRQs and take the spinlock, whilst still
+ * maintaining our promise
+ */
+ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ /* we promise that we will not attempt to execute the work
+ * function in more than one thread simultaneously
+ *
+ * this, however, leaves us with a problem if we're asked to
+ * enqueue the work whilst someone is executing the work
+ * function as simply queueing the work immediately means that
+ * another thread may try executing it whilst it is already
+ * under execution
+ *
+ * to deal with this, we set the ENQ_DEFERRED bit instead of
+ * enqueueing, and the thread currently executing the work
+ * function will enqueue the work item when the work function
+ * returns and it has cleared the EXECUTING bit
+ */
+ if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+ set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+ } else {
+ if (work->ops->get_ref(work) < 0)
+ goto cant_get_ref;
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ wake_up(&slow_work_thread_wq);
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ }
+ return 0;
+
+cant_get_ref:
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Schedule a cull of the thread pool at some time in the near future
+ */
+static void slow_work_schedule_cull(void)
+{
+ mod_timer(&slow_work_cull_timer,
+ round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
+}
+
+/*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+ unsigned long flags;
+ bool do_cull = false;
+
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ if (slow_work_cull) {
+ slow_work_cull = false;
+
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads) {
+ slow_work_schedule_cull();
+ do_cull = true;
+ }
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return do_cull;
+}
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+ return !list_empty(&slow_work_queue) ||
+ (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+ int vsmax;
+
+ DEFINE_WAIT(wait);
+
+ set_freezable();
+ set_user_nice(current, -5);
+
+ for (;;) {
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !slow_work_threads_should_exit &&
+ !slow_work_available(vsmax) &&
+ !slow_work_cull)
+ schedule();
+ finish_wait(&slow_work_thread_wq, &wait);
+
+ try_to_freeze();
+
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ if (slow_work_available(vsmax) && slow_work_execute()) {
+ cond_resched();
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads)
+ slow_work_schedule_cull();
+ continue;
+ }
+
+ if (slow_work_threads_should_exit)
+ break;
+
+ if (slow_work_cull && slow_work_cull_thread())
+ break;
+ }
+
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete_and_exit(&slow_work_last_thread_exited, 0);
+ return 0;
+}
+
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+ slow_work_cull = true;
+ wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+ return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+ struct task_struct *p;
+
+ if (slow_work_threads_should_exit)
+ return;
+
+ if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+ return;
+
+ if (!mutex_trylock(&slow_work_user_lock))
+ return;
+
+ slow_work_may_not_start_new_thread = true;
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p)) {
+ printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ BUG(); /* we're running on a slow work thread... */
+ mod_timer(&slow_work_oom_timer,
+ round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
+ } else {
+ /* ratelimit the starting of new threads */
+ mod_timer(&slow_work_oom_timer, jiffies + 1);
+ }
+
+ mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+ .get_ref = slow_work_new_thread_get_ref,
+ .put_ref = slow_work_new_thread_put_ref,
+ .execute = slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+ slow_work_may_not_start_new_thread = false;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to start or stop threads */
+ n = atomic_read(&slow_work_thread_count) -
+ slow_work_min_threads;
+
+ if (n < 0 && !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+ else if (n > 0)
+ slow_work_schedule_cull();
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to stop threads */
+ n = slow_work_max_threads -
+ atomic_read(&slow_work_thread_count);
+
+ if (n < 0)
+ slow_work_schedule_cull();
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point. This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+ struct task_struct *p;
+ int loop;
+
+ mutex_lock(&slow_work_user_lock);
+
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+ init_completion(&slow_work_last_thread_exited);
+
+ slow_work_threads_should_exit = false;
+ slow_work_init(&slow_work_new_thread,
+ &slow_work_new_thread_ops);
+ slow_work_may_not_start_new_thread = false;
+ slow_work_cull = false;
+
+ /* start the minimum number of threads */
+ for (loop = 0; loop < slow_work_min_threads; loop++) {
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p))
+ goto error;
+ }
+ printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+ }
+
+ slow_work_user_count++;
+ mutex_unlock(&slow_work_user_lock);
+ return 0;
+
+error:
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete(&slow_work_last_thread_exited);
+ if (loop > 0) {
+ printk(KERN_ERR "Slow work thread pool:"
+ " Aborting startup on ENOMEM\n");
+ slow_work_threads_should_exit = true;
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_ERR "Slow work thread pool: Aborted\n");
+ }
+ mutex_unlock(&slow_work_user_lock);
+ return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+ mutex_lock(&slow_work_user_lock);
+
+ BUG_ON(slow_work_user_count <= 0);
+
+ slow_work_user_count--;
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+ slow_work_threads_should_exit = true;
+ del_timer_sync(&slow_work_cull_timer);
+ del_timer_sync(&slow_work_oom_timer);
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_NOTICE "Slow work thread pool:"
+ " Shut down complete\n");
+ }
+
+ mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+ unsigned nr_cpus = num_possible_cpus();
+
+ if (slow_work_max_threads < nr_cpus)
+ slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+ if (slow_work_max_max_threads < nr_cpus * 2)
+ slow_work_max_max_threads = nr_cpus * 2;
+#endif
+ return 0;
+}
+
+subsys_initcall(init_slow_work);
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e8..94188b8ecc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,39 +2,82 @@
* Generic helpers for smp ipi calls
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
- *
*/
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
-static LIST_HEAD(call_function_queue);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+static struct {
+ struct list_head queue;
+ spinlock_t lock;
+} call_function __cacheline_aligned_in_smp =
+ {
+ .queue = LIST_HEAD_INIT(call_function.queue),
+ .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
+ };
enum {
- CSD_FLAG_WAIT = 0x01,
- CSD_FLAG_ALLOC = 0x02,
+ CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
- struct call_single_data csd;
- spinlock_t lock;
- unsigned int refs;
- struct rcu_head rcu_head;
- unsigned long cpumask_bits[];
+ struct call_single_data csd;
+ spinlock_t lock;
+ unsigned int refs;
+ cpumask_var_t cpumask;
};
struct call_single_queue {
- struct list_head list;
- spinlock_t lock;
+ struct list_head list;
+ spinlock_t lock;
+};
+
+static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
+ .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+};
+
+static int
+hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return NOTIFY_BAD;
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ free_cpumask_var(cfd->cpumask);
+ break;
+#endif
+ };
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+ .notifier_call = hotplug_cfd,
};
static int __cpuinit init_call_single_data(void)
{
+ void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i) {
@@ -43,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->list);
}
+
+ hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
+ register_cpu_notifier(&hotplug_cfd_notifier);
+
return 0;
}
early_initcall(init_call_single_data);
-static void csd_flag_wait(struct call_single_data *data)
+/*
+ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
+ *
+ * For non-synchronous ipi calls the csd can still be in use by the
+ * previous function call. For multi-cpu calls its even more interesting
+ * as we'll have to ensure no other cpu is observing our csd.
+ */
+static void csd_lock_wait(struct call_single_data *data)
{
- /* Wait for response */
- do {
- if (!(data->flags & CSD_FLAG_WAIT))
- break;
+ while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
- } while (1);
+}
+
+static void csd_lock(struct call_single_data *data)
+{
+ csd_lock_wait(data);
+ data->flags = CSD_FLAG_LOCK;
+
+ /*
+ * prevent CPU from reordering the above assignment
+ * to ->flags with any subsequent assignments to other
+ * fields of the specified call_single_data structure:
+ */
+ smp_mb();
+}
+
+static void csd_unlock(struct call_single_data *data)
+{
+ WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+
+ /*
+ * ensure we're all done before releasing data:
+ */
+ smp_mb();
+
+ data->flags &= ~CSD_FLAG_LOCK;
}
/*
- * Insert a previously allocated call_single_data element for execution
- * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ * Insert a previously allocated call_single_data element
+ * for execution on the given CPU. data must already have
+ * ->func, ->info, and ->flags set.
*/
-static void generic_exec_single(int cpu, struct call_single_data *data)
+static
+void generic_exec_single(int cpu, struct call_single_data *data, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
- int wait = data->flags & CSD_FLAG_WAIT, ipi;
unsigned long flags;
+ int ipi;
spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
@@ -73,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
spin_unlock_irqrestore(&dst->lock, flags);
/*
- * Make the list addition visible before sending the ipi.
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
*/
- smp_mb();
-
if (ipi)
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_flag_wait(data);
-}
-
-static void rcu_free_call_data(struct rcu_head *head)
-{
- struct call_function_data *data;
-
- data = container_of(head, struct call_function_data, rcu_head);
-
- kfree(data);
+ csd_lock_wait(data);
}
/*
@@ -103,99 +177,88 @@ void generic_smp_call_function_interrupt(void)
int cpu = get_cpu();
/*
- * It's ok to use list_for_each_rcu() here even though we may delete
- * 'pos', since list_del_rcu() doesn't clear ->next
+ * Ensure entry is visible on call_function_queue after we have
+ * entered the IPI. See comment in smp_call_function_many.
+ * If we don't have this, then we may miss an entry on the list
+ * and never get another IPI to process it.
+ */
+ smp_mb();
+
+ /*
+ * It's ok to use list_for_each_rcu() here even though we may
+ * delete 'pos', since list_del_rcu() doesn't clear ->next
*/
- rcu_read_lock();
- list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+ list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
- if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
+ spin_lock(&data->lock);
+ if (!cpumask_test_cpu(cpu, data->cpumask)) {
+ spin_unlock(&data->lock);
continue;
+ }
+ cpumask_clear_cpu(cpu, data->cpumask);
+ spin_unlock(&data->lock);
data->csd.func(data->csd.info);
spin_lock(&data->lock);
- cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
WARN_ON(data->refs == 0);
- data->refs--;
- refs = data->refs;
+ refs = --data->refs;
+ if (!refs) {
+ spin_lock(&call_function.lock);
+ list_del_rcu(&data->csd.list);
+ spin_unlock(&call_function.lock);
+ }
spin_unlock(&data->lock);
if (refs)
continue;
- spin_lock(&call_function_lock);
- list_del_rcu(&data->csd.list);
- spin_unlock(&call_function_lock);
-
- if (data->csd.flags & CSD_FLAG_WAIT) {
- /*
- * serialize stores to data with the flag clear
- * and wakeup
- */
- smp_wmb();
- data->csd.flags &= ~CSD_FLAG_WAIT;
- }
- if (data->csd.flags & CSD_FLAG_ALLOC)
- call_rcu(&data->rcu_head, rcu_free_call_data);
+ csd_unlock(&data->csd);
}
- rcu_read_unlock();
put_cpu();
}
/*
- * Invoked by arch to handle an IPI for call function single. Must be called
- * from the arch with interrupts disabled.
+ * Invoked by arch to handle an IPI for call function single. Must be
+ * called from the arch with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
+ unsigned int data_flags;
LIST_HEAD(list);
- /*
- * Need to see other stores to list head for checking whether
- * list is empty without holding q->lock
- */
- smp_read_barrier_depends();
- while (!list_empty(&q->list)) {
- unsigned int data_flags;
-
- spin_lock(&q->lock);
- list_replace_init(&q->list, &list);
- spin_unlock(&q->lock);
-
- while (!list_empty(&list)) {
- struct call_single_data *data;
-
- data = list_entry(list.next, struct call_single_data,
- list);
- list_del(&data->list);
-
- /*
- * 'data' can be invalid after this call if
- * flags == 0 (when called through
- * generic_exec_single(), so save them away before
- * making the call.
- */
- data_flags = data->flags;
-
- data->func(data->info);
-
- if (data_flags & CSD_FLAG_WAIT) {
- smp_wmb();
- data->flags &= ~CSD_FLAG_WAIT;
- } else if (data_flags & CSD_FLAG_ALLOC)
- kfree(data);
- }
+ spin_lock(&q->lock);
+ list_replace_init(&q->list, &list);
+ spin_unlock(&q->lock);
+
+ while (!list_empty(&list)) {
+ struct call_single_data *data;
+
+ data = list_entry(list.next, struct call_single_data, list);
+ list_del(&data->list);
+
/*
- * See comment on outer loop
+ * 'data' can be invalid after this call if flags == 0
+ * (when called through generic_exec_single()),
+ * so save them away before making the call:
*/
- smp_read_barrier_depends();
+ data_flags = data->flags;
+
+ data->func(data->info);
+
+ /*
+ * Unlocked CSDs are valid through generic_exec_single():
+ */
+ if (data_flags & CSD_FLAG_LOCK)
+ csd_unlock(data);
}
}
+static DEFINE_PER_CPU(struct call_single_data, csd_data);
+
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
@@ -209,41 +272,45 @@ void generic_smp_call_function_single_interrupt(void)
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
{
- struct call_single_data d;
+ struct call_single_data d = {
+ .flags = 0,
+ };
unsigned long flags;
- /* prevent preemption and reschedule on another processor,
- as well as CPU removal */
- int me = get_cpu();
+ int this_cpu;
int err = 0;
+ /*
+ * prevent preemption and reschedule on another processor,
+ * as well as CPU removal
+ */
+ this_cpu = get_cpu();
+
/* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
- if (cpu == me) {
+ if (cpu == this_cpu) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
- } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = NULL;
+ } else {
+ if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
+ struct call_single_data *data = &d;
- if (!wait) {
- data = kmalloc(sizeof(*data), GFP_ATOMIC);
- if (data)
- data->flags = CSD_FLAG_ALLOC;
- }
- if (!data) {
- data = &d;
- data->flags = CSD_FLAG_WAIT;
- }
+ if (!wait)
+ data = &__get_cpu_var(csd_data);
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data);
- } else {
- err = -ENXIO; /* CPU not online */
+ csd_lock(data);
+
+ data->func = func;
+ data->info = info;
+ generic_exec_single(cpu, data, wait);
+ } else {
+ err = -ENXIO; /* CPU not online */
+ }
}
put_cpu();
+
return err;
}
EXPORT_SYMBOL(smp_call_function_single);
@@ -253,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
* @cpu: The CPU to run on.
* @data: Pre-allocated and setup data structure
*
- * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
- * data structure. Useful for embedding @data inside other structures, for
- * instance.
- *
+ * Like smp_call_function_single(), but allow caller to pass in a
+ * pre-allocated data structure. Useful for embedding @data inside
+ * other structures, for instance.
*/
-void __smp_call_function_single(int cpu, struct call_single_data *data)
+void __smp_call_function_single(int cpu, struct call_single_data *data,
+ int wait)
{
+ csd_lock(data);
+
/* Can deadlock when called with interrupts disabled */
- WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+ WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
- generic_exec_single(cpu, data);
+ generic_exec_single(cpu, data, wait);
}
-/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
+/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
+
#ifndef arch_send_call_function_ipi_mask
-#define arch_send_call_function_ipi_mask(maskp) \
- arch_send_call_function_ipi(*(maskp))
+# define arch_send_call_function_ipi_mask(maskp) \
+ arch_send_call_function_ipi(*(maskp))
#endif
/**
@@ -277,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* If @wait is true, then returns once @func has returned. Note that @wait
* will be implicitly turned on in case of allocation failures, since
@@ -288,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
- void (*func)(void *), void *info,
- bool wait)
+ void (*func)(void *), void *info, bool wait)
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu;
+ int cpu, next_cpu, this_cpu = smp_processor_id();
/* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
- if (cpu == smp_processor_id())
+ if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
/* No online cpus? We're done. */
if (cpu >= nr_cpu_ids)
return;
/* Do we have another CPU which isn't us? */
next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (next_cpu == smp_processor_id())
+ if (next_cpu == this_cpu)
next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
/* Fastpath: do that cpu by itself. */
@@ -317,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
- if (unlikely(!data)) {
- /* Slow path. */
- for_each_online_cpu(cpu) {
- if (cpu == smp_processor_id())
- continue;
- if (cpumask_test_cpu(cpu, mask))
- smp_call_function_single(cpu, func, info, wait);
- }
- return;
- }
+ data = &__get_cpu_var(cfd_data);
+ csd_lock(&data->csd);
- spin_lock_init(&data->lock);
- data->csd.flags = CSD_FLAG_ALLOC;
- if (wait)
- data->csd.flags |= CSD_FLAG_WAIT;
+ spin_lock_irqsave(&data->lock, flags);
data->csd.func = func;
data->csd.info = info;
- cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
- data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
+ cpumask_and(data->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, data->cpumask);
+ data->refs = cpumask_weight(data->cpumask);
- spin_lock_irqsave(&call_function_lock, flags);
- list_add_tail_rcu(&data->csd.list, &call_function_queue);
- spin_unlock_irqrestore(&call_function_lock, flags);
+ spin_lock(&call_function.lock);
+ /*
+ * Place entry at the _HEAD_ of the list, so that any cpu still
+ * observing the entry in generic_smp_call_function_interrupt()
+ * will not miss any other list entries:
+ */
+ list_add_rcu(&data->csd.list, &call_function.queue);
+ spin_unlock(&call_function.lock);
+
+ spin_unlock_irqrestore(&data->lock, flags);
/*
* Make the list addition visible before sending the ipi.
+ * (IPIs must obey or appear to obey normal Linux cache
+ * coherency rules -- see comment in generic_exec_single).
*/
smp_mb();
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
+ arch_send_call_function_ipi_mask(data->cpumask);
- /* optionally wait for the CPUs to complete */
+ /* Optionally wait for the CPUs to complete */
if (wait)
- csd_flag_wait(&data->csd);
+ csd_lock_wait(&data->csd);
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -361,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* Returns 0.
*
@@ -377,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
+
return 0;
}
EXPORT_SYMBOL(smp_call_function);
void ipi_call_lock(void)
{
- spin_lock(&call_function_lock);
+ spin_lock(&call_function.lock);
}
void ipi_call_unlock(void)
{
- spin_unlock(&call_function_lock);
+ spin_unlock(&call_function.lock);
}
void ipi_call_lock_irq(void)
{
- spin_lock_irq(&call_function_lock);
+ spin_lock_irq(&call_function.lock);
}
void ipi_call_unlock_irq(void)
{
- spin_unlock_irq(&call_function_lock);
+ spin_unlock_irq(&call_function.lock);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8..7db25067cd2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,9 +21,13 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
+#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/tick.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
+
#include <asm/irq.h>
/*
- No shared variables, all the data are CPU local.
@@ -52,13 +56,18 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+char *softirq_to_name[NR_SOFTIRQS] = {
+ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
+ "TASKLET", "SCHED", "HRTIMER", "RCU"
+};
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
-static inline void wakeup_softirqd(void)
+void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -79,13 +88,23 @@ static void __local_bh_disable(unsigned long ip)
WARN_ON_ONCE(in_irq());
raw_local_irq_save(flags);
- add_preempt_count(SOFTIRQ_OFFSET);
+ /*
+ * The preempt tracer hooks into add_preempt_count and will break
+ * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
+ * is set and before current->softirq_enabled is cleared.
+ * We must manually increment preempt_count here and manually
+ * call the trace_preempt_off later.
+ */
+ preempt_count() += SOFTIRQ_OFFSET;
/*
* Were softirqs turned off above:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
+
+ if (preempt_count() == SOFTIRQ_OFFSET)
+ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip)
@@ -180,7 +199,7 @@ asmlinkage void __do_softirq(void)
account_system_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0));
- trace_softirq_enter();
+ lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
@@ -194,18 +213,21 @@ restart:
do {
if (pending & 1) {
int prev_count = preempt_count();
+ kstat_incr_softirqs_this_cpu(h - softirq_vec);
+ trace_softirq_entry(h, softirq_vec);
h->action(h);
-
+ trace_softirq_exit(h, softirq_vec);
if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %td %p"
+ printk(KERN_ERR "huh, entered softirq %td %s %p"
"with preempt_count %08x,"
" exited with %08x?\n", h - softirq_vec,
+ softirq_to_name[h - softirq_vec],
h->action, prev_count, preempt_count());
preempt_count() = prev_count;
}
- rcu_bh_qsctr_inc(cpu);
+ rcu_bh_qs(cpu);
}
h++;
pending >>= 1;
@@ -220,7 +242,7 @@ restart:
if (pending)
wakeup_softirqd();
- trace_softirq_exit();
+ lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
@@ -323,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
softirq_vec[nr].action = action;
}
-/* Tasklets */
+/*
+ * Tasklets
+ */
struct tasklet_head
{
struct tasklet_struct *head;
@@ -361,6 +385,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
EXPORT_SYMBOL(__tasklet_hi_schedule);
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ BUG_ON(!irqs_disabled());
+
+ t->next = __get_cpu_var(tasklet_hi_vec).head;
+ __get_cpu_var(tasklet_hi_vec).head = t;
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
+
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
@@ -450,9 +485,9 @@ void tasklet_kill(struct tasklet_struct *t)
printk("Attempt to kill tasklet from interrupt\n");
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- do
+ do {
yield();
- while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ } while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
@@ -460,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
EXPORT_SYMBOL(tasklet_kill);
+/*
+ * tasklet_hrtimer
+ */
+
+/*
+ * The trampoline is called when the hrtimer expires. If this is
+ * called from the hrtimer interrupt then we schedule the tasklet as
+ * the timer callback function expects to run in softirq context. If
+ * it's called in softirq context anyway (i.e. high resolution timers
+ * disabled) then the hrtimer callback is called right away.
+ */
+static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
+{
+ struct tasklet_hrtimer *ttimer =
+ container_of(timer, struct tasklet_hrtimer, timer);
+
+ if (hrtimer_is_hres_active(timer)) {
+ tasklet_hi_schedule(&ttimer->tasklet);
+ return HRTIMER_NORESTART;
+ }
+ return ttimer->function(timer);
+}
+
+/*
+ * Helper function which calls the hrtimer callback from
+ * tasklet/softirq context
+ */
+static void __tasklet_hrtimer_trampoline(unsigned long data)
+{
+ struct tasklet_hrtimer *ttimer = (void *)data;
+ enum hrtimer_restart restart;
+
+ restart = ttimer->function(&ttimer->timer);
+ if (restart != HRTIMER_NORESTART)
+ hrtimer_restart(&ttimer->timer);
+}
+
+/**
+ * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
+ * @ttimer: tasklet_hrtimer which is initialized
+ * @function: hrtimer callback funtion which gets called from softirq context
+ * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
+ * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
+ */
+void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
+ enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t which_clock, enum hrtimer_mode mode)
+{
+ hrtimer_init(&ttimer->timer, which_clock, mode);
+ ttimer->timer.function = __hrtimer_tasklet_trampoline;
+ tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
+ (unsigned long)ttimer);
+ ttimer->function = function;
+}
+EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
+
+/*
+ * Remote softirq bits
+ */
+
DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
EXPORT_PER_CPU_SYMBOL(softirq_work_list);
@@ -496,7 +591,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
cp->flags = 0;
cp->priv = softirq;
- __smp_call_function_single(cpu, cp);
+ __smp_call_function_single(cpu, cp, 0);
return 0;
}
return 1;
@@ -626,6 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
preempt_enable_no_resched();
cond_resched();
preempt_disable();
+ rcu_sched_qs((long)__bind_cpu);
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
@@ -795,12 +891,17 @@ int __init __weak early_irq_init(void)
return 0;
}
+int __init __weak arch_probe_nr_irqs(void)
+{
+ return 0;
+}
+
int __init __weak arch_early_irq_init(void)
{
return 0;
}
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
{
return 0;
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278..88796c33083 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
+#include <linux/sysctl.h>
#include <asm/irq_regs.h>
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
}
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ touch_all_softlockup_watchdogs();
+ return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+}
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
@@ -157,97 +166,11 @@ void softlockup_tick(void)
}
/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -267,11 +190,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;
- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -303,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = cpumask_any(cpu_online_mask);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- /* Pick any other online cpu. */
- check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd..5ddab730cb2 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
#include <linux/debug_locks.h>
#include <linux/module.h>
+#ifndef _spin_trylock
int __lockfunc _spin_trylock(spinlock_t *lock)
{
- preempt_disable();
- if (_raw_spin_trylock(lock)) {
- spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- return 1;
- }
-
- preempt_enable();
- return 0;
+ return __spin_trylock(lock);
}
EXPORT_SYMBOL(_spin_trylock);
+#endif
+#ifndef _read_trylock
int __lockfunc _read_trylock(rwlock_t *lock)
{
- preempt_disable();
- if (_raw_read_trylock(lock)) {
- rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
- return 1;
- }
-
- preempt_enable();
- return 0;
+ return __read_trylock(lock);
}
EXPORT_SYMBOL(_read_trylock);
+#endif
+#ifndef _write_trylock
int __lockfunc _write_trylock(rwlock_t *lock)
{
- preempt_disable();
- if (_raw_write_trylock(lock)) {
- rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- return 1;
- }
-
- preempt_enable();
- return 0;
+ return __write_trylock(lock);
}
EXPORT_SYMBOL(_write_trylock);
+#endif
/*
* If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,130 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
*/
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+#ifndef _read_lock
void __lockfunc _read_lock(rwlock_t *lock)
{
- preempt_disable();
- rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+ __read_lock(lock);
}
EXPORT_SYMBOL(_read_lock);
+#endif
+#ifndef _spin_lock_irqsave
unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
{
- unsigned long flags;
-
- local_irq_save(flags);
- preempt_disable();
- spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- /*
- * On lockdep we dont want the hand-coded irq-enable of
- * _raw_spin_lock_flags() code, because lockdep assumes
- * that interrupts are not re-enabled during lock-acquire:
- */
-#ifdef CONFIG_LOCKDEP
- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-#else
- _raw_spin_lock_flags(lock, &flags);
-#endif
- return flags;
+ return __spin_lock_irqsave(lock);
}
EXPORT_SYMBOL(_spin_lock_irqsave);
+#endif
+#ifndef _spin_lock_irq
void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
- local_irq_disable();
- preempt_disable();
- spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+ __spin_lock_irq(lock);
}
EXPORT_SYMBOL(_spin_lock_irq);
+#endif
+#ifndef _spin_lock_bh
void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
- local_bh_disable();
- preempt_disable();
- spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+ __spin_lock_bh(lock);
}
EXPORT_SYMBOL(_spin_lock_bh);
+#endif
+#ifndef _read_lock_irqsave
unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
{
- unsigned long flags;
-
- local_irq_save(flags);
- preempt_disable();
- rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
- return flags;
+ return __read_lock_irqsave(lock);
}
EXPORT_SYMBOL(_read_lock_irqsave);
+#endif
+#ifndef _read_lock_irq
void __lockfunc _read_lock_irq(rwlock_t *lock)
{
- local_irq_disable();
- preempt_disable();
- rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+ __read_lock_irq(lock);
}
EXPORT_SYMBOL(_read_lock_irq);
+#endif
+#ifndef _read_lock_bh
void __lockfunc _read_lock_bh(rwlock_t *lock)
{
- local_bh_disable();
- preempt_disable();
- rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+ __read_lock_bh(lock);
}
EXPORT_SYMBOL(_read_lock_bh);
+#endif
+#ifndef _write_lock_irqsave
unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
{
- unsigned long flags;
-
- local_irq_save(flags);
- preempt_disable();
- rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
- return flags;
+ return __write_lock_irqsave(lock);
}
EXPORT_SYMBOL(_write_lock_irqsave);
+#endif
+#ifndef _write_lock_irq
void __lockfunc _write_lock_irq(rwlock_t *lock)
{
- local_irq_disable();
- preempt_disable();
- rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+ __write_lock_irq(lock);
}
EXPORT_SYMBOL(_write_lock_irq);
+#endif
+#ifndef _write_lock_bh
void __lockfunc _write_lock_bh(rwlock_t *lock)
{
- local_bh_disable();
- preempt_disable();
- rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+ __write_lock_bh(lock);
}
EXPORT_SYMBOL(_write_lock_bh);
+#endif
+#ifndef _spin_lock
void __lockfunc _spin_lock(spinlock_t *lock)
{
- preempt_disable();
- spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+ __spin_lock(lock);
}
-
EXPORT_SYMBOL(_spin_lock);
+#endif
+#ifndef _write_lock
void __lockfunc _write_lock(rwlock_t *lock)
{
- preempt_disable();
- rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+ __write_lock(lock);
}
-
EXPORT_SYMBOL(_write_lock);
+#endif
#else /* CONFIG_PREEMPT: */
@@ -299,16 +255,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- /*
- * On lockdep we dont want the hand-coded irq-enable of
- * _raw_spin_lock_flags() code, because lockdep assumes
- * that interrupts are not re-enabled during lock-acquire:
- */
-#ifdef CONFIG_LOCKDEP
- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-#else
- _raw_spin_lock_flags(lock, &flags);
-#endif
+ LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
+ _raw_spin_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave_nested);
@@ -324,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
#endif
+#ifndef _spin_unlock
void __lockfunc _spin_unlock(spinlock_t *lock)
{
- spin_release(&lock->dep_map, 1, _RET_IP_);
- _raw_spin_unlock(lock);
- preempt_enable();
+ __spin_unlock(lock);
}
EXPORT_SYMBOL(_spin_unlock);
+#endif
+#ifndef _write_unlock
void __lockfunc _write_unlock(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_write_unlock(lock);
- preempt_enable();
+ __write_unlock(lock);
}
EXPORT_SYMBOL(_write_unlock);
+#endif
+#ifndef _read_unlock
void __lockfunc _read_unlock(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_read_unlock(lock);
- preempt_enable();
+ __read_unlock(lock);
}
EXPORT_SYMBOL(_read_unlock);
+#endif
+#ifndef _spin_unlock_irqrestore
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
- spin_release(&lock->dep_map, 1, _RET_IP_);
- _raw_spin_unlock(lock);
- local_irq_restore(flags);
- preempt_enable();
+ __spin_unlock_irqrestore(lock, flags);
}
EXPORT_SYMBOL(_spin_unlock_irqrestore);
+#endif
+#ifndef _spin_unlock_irq
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
{
- spin_release(&lock->dep_map, 1, _RET_IP_);
- _raw_spin_unlock(lock);
- local_irq_enable();
- preempt_enable();
+ __spin_unlock_irq(lock);
}
EXPORT_SYMBOL(_spin_unlock_irq);
+#endif
+#ifndef _spin_unlock_bh
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
- spin_release(&lock->dep_map, 1, _RET_IP_);
- _raw_spin_unlock(lock);
- preempt_enable_no_resched();
- local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+ __spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_spin_unlock_bh);
+#endif
+#ifndef _read_unlock_irqrestore
void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_read_unlock(lock);
- local_irq_restore(flags);
- preempt_enable();
+ __read_unlock_irqrestore(lock, flags);
}
EXPORT_SYMBOL(_read_unlock_irqrestore);
+#endif
+#ifndef _read_unlock_irq
void __lockfunc _read_unlock_irq(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_read_unlock(lock);
- local_irq_enable();
- preempt_enable();
+ __read_unlock_irq(lock);
}
EXPORT_SYMBOL(_read_unlock_irq);
+#endif
+#ifndef _read_unlock_bh
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_read_unlock(lock);
- preempt_enable_no_resched();
- local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+ __read_unlock_bh(lock);
}
EXPORT_SYMBOL(_read_unlock_bh);
+#endif
+#ifndef _write_unlock_irqrestore
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_write_unlock(lock);
- local_irq_restore(flags);
- preempt_enable();
+ __write_unlock_irqrestore(lock, flags);
}
EXPORT_SYMBOL(_write_unlock_irqrestore);
+#endif
+#ifndef _write_unlock_irq
void __lockfunc _write_unlock_irq(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_write_unlock(lock);
- local_irq_enable();
- preempt_enable();
+ __write_unlock_irq(lock);
}
EXPORT_SYMBOL(_write_unlock_irq);
+#endif
+#ifndef _write_unlock_bh
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
- rwlock_release(&lock->dep_map, 1, _RET_IP_);
- _raw_write_unlock(lock);
- preempt_enable_no_resched();
- local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+ __write_unlock_bh(lock);
}
EXPORT_SYMBOL(_write_unlock_bh);
+#endif
+#ifndef _spin_trylock_bh
int __lockfunc _spin_trylock_bh(spinlock_t *lock)
{
- local_bh_disable();
- preempt_disable();
- if (_raw_spin_trylock(lock)) {
- spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- return 1;
- }
-
- preempt_enable_no_resched();
- local_bh_enable_ip((unsigned long)__builtin_return_address(0));
- return 0;
+ return __spin_trylock_bh(lock);
}
EXPORT_SYMBOL(_spin_trylock_bh);
+#endif
notrace int in_lock_functions(unsigned long addr)
{
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a..912823e2a11 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
static int refcount;
static struct workqueue_struct *stop_machine_wq;
static struct stop_machine_data active, idle;
-static const cpumask_t *active_cpus;
+static const struct cpumask *active_cpus;
static void *stop_machine_work;
static void set_state(enum stopmachine_state newstate)
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
* doesn't hit this CPU until we're ready. */
get_cpu();
for_each_online_cpu(i) {
- sm_work = percpu_ptr(stop_machine_work, i);
+ sm_work = per_cpu_ptr(stop_machine_work, i);
INIT_WORK(sm_work, stop_cpu);
queue_work_on(i, stop_machine_wq, sm_work);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index e7dc0e10a48..b3f1097c76f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/perf_counter.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
@@ -34,6 +35,7 @@
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
+#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -359,6 +361,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
char buffer[256];
+ int ret = 0;
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT))
@@ -396,7 +399,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_halt();
unlock_kernel();
do_exit(0);
- break;
+ panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
@@ -416,29 +419,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
#ifdef CONFIG_KEXEC
case LINUX_REBOOT_CMD_KEXEC:
- {
- int ret;
- ret = kernel_kexec();
- unlock_kernel();
- return ret;
- }
+ ret = kernel_kexec();
+ break;
#endif
#ifdef CONFIG_HIBERNATION
case LINUX_REBOOT_CMD_SW_SUSPEND:
- {
- int ret = hibernate();
- unlock_kernel();
- return ret;
- }
+ ret = hibernate();
+ break;
#endif
default:
- unlock_kernel();
- return -EINVAL;
+ ret = -EINVAL;
+ break;
}
unlock_kernel();
- return 0;
+ return ret;
}
static void deferred_cad(struct work_struct *dummy)
@@ -559,7 +555,7 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
* change the user struct in a credentials set to match the new UID
*/
@@ -571,6 +567,11 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;
+ if (!task_can_switch_user(new_user, current)) {
+ free_uid(new_user);
+ return -EINVAL;
+ }
+
if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
new_user != INIT_USER) {
@@ -631,10 +632,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
goto error;
}
- retval = -EAGAIN;
- if (new->uid != old->uid && set_user(new) < 0)
- goto error;
-
+ if (new->uid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
if (ruid != (uid_t) -1 ||
(euid != (uid_t) -1 && euid != old->uid))
new->suid = new->euid;
@@ -680,9 +682,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
retval = -EPERM;
if (capable(CAP_SETUID)) {
new->suid = new->uid = uid;
- if (uid != old->uid && set_user(new) < 0) {
- retval = -EAGAIN;
- goto error;
+ if (uid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
}
} else if (uid != old->uid && uid != new->suid) {
goto error;
@@ -734,11 +737,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
goto error;
}
- retval = -EAGAIN;
if (ruid != (uid_t) -1) {
new->uid = ruid;
- if (ruid != old->uid && set_user(new) < 0)
- goto error;
+ if (ruid != old->uid) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
}
if (euid != (uid_t) -1)
new->euid = euid;
@@ -1004,10 +1009,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
if (err)
goto out;
- if (task_pgrp(p) != pgrp) {
+ if (task_pgrp(p) != pgrp)
change_pid(p, PIDTYPE_PGID, pgrp);
- set_task_pgrp(p, pid_nr(pgrp));
- }
err = 0;
out:
@@ -1110,289 +1113,6 @@ out:
return err;
}
-/*
- * Supplementary group IDs
- */
-
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
-struct group_info *groups_alloc(int gidsetsize)
-{
- struct group_info *group_info;
- int nblocks;
- int i;
-
- nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
- /* Make sure we always allocate at least one indirect block pointer */
- nblocks = nblocks ? : 1;
- group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
- if (!group_info)
- return NULL;
- group_info->ngroups = gidsetsize;
- group_info->nblocks = nblocks;
- atomic_set(&group_info->usage, 1);
-
- if (gidsetsize <= NGROUPS_SMALL)
- group_info->blocks[0] = group_info->small_block;
- else {
- for (i = 0; i < nblocks; i++) {
- gid_t *b;
- b = (void *)__get_free_page(GFP_USER);
- if (!b)
- goto out_undo_partial_alloc;
- group_info->blocks[i] = b;
- }
- }
- return group_info;
-
-out_undo_partial_alloc:
- while (--i >= 0) {
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
- return NULL;
-}
-
-EXPORT_SYMBOL(groups_alloc);
-
-void groups_free(struct group_info *group_info)
-{
- if (group_info->blocks[0] != group_info->small_block) {
- int i;
- for (i = 0; i < group_info->nblocks; i++)
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
-}
-
-EXPORT_SYMBOL(groups_free);
-
-/* export the group_info to a user-space array */
-static int groups_to_user(gid_t __user *grouplist,
- const struct group_info *group_info)
-{
- int i;
- unsigned int count = group_info->ngroups;
-
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_to_user(grouplist, group_info->blocks[i], len))
- return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
- }
- return 0;
-}
-
-/* fill a group_info from a user-space array - it must be allocated already */
-static int groups_from_user(struct group_info *group_info,
- gid_t __user *grouplist)
-{
- int i;
- unsigned int count = group_info->ngroups;
-
- for (i = 0; i < group_info->nblocks; i++) {
- unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
- unsigned int len = cp_count * sizeof(*grouplist);
-
- if (copy_from_user(group_info->blocks[i], grouplist, len))
- return -EFAULT;
-
- grouplist += NGROUPS_PER_BLOCK;
- count -= cp_count;
- }
- return 0;
-}
-
-/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
-{
- int base, max, stride;
- int gidsetsize = group_info->ngroups;
-
- for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
- ; /* nothing */
- stride /= 3;
-
- while (stride) {
- max = gidsetsize - stride;
- for (base = 0; base < max; base++) {
- int left = base;
- int right = left + stride;
- gid_t tmp = GROUP_AT(group_info, right);
-
- while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
- GROUP_AT(group_info, right) =
- GROUP_AT(group_info, left);
- right = left;
- left -= stride;
- }
- GROUP_AT(group_info, right) = tmp;
- }
- stride /= 3;
- }
-}
-
-/* a simple bsearch */
-int groups_search(const struct group_info *group_info, gid_t grp)
-{
- unsigned int left, right;
-
- if (!group_info)
- return 0;
-
- left = 0;
- right = group_info->ngroups;
- while (left < right) {
- unsigned int mid = (left+right)/2;
- int cmp = grp - GROUP_AT(group_info, mid);
- if (cmp > 0)
- left = mid + 1;
- else if (cmp < 0)
- right = mid;
- else
- return 1;
- }
- return 0;
-}
-
-/**
- * set_groups - Change a group subscription in a set of credentials
- * @new: The newly prepared set of credentials to alter
- * @group_info: The group list to install
- *
- * Validate a group subscription and, if valid, insert it into a set
- * of credentials.
- */
-int set_groups(struct cred *new, struct group_info *group_info)
-{
- int retval;
-
- retval = security_task_setgroups(group_info);
- if (retval)
- return retval;
-
- put_group_info(new->group_info);
- groups_sort(group_info);
- get_group_info(group_info);
- new->group_info = group_info;
- return 0;
-}
-
-EXPORT_SYMBOL(set_groups);
-
-/**
- * set_current_groups - Change current's group subscription
- * @group_info: The group list to impose
- *
- * Validate a group subscription and, if valid, impose it upon current's task
- * security record.
- */
-int set_current_groups(struct group_info *group_info)
-{
- struct cred *new;
- int ret;
-
- new = prepare_creds();
- if (!new)
- return -ENOMEM;
-
- ret = set_groups(new, group_info);
- if (ret < 0) {
- abort_creds(new);
- return ret;
- }
-
- return commit_creds(new);
-}
-
-EXPORT_SYMBOL(set_current_groups);
-
-SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
- const struct cred *cred = current_cred();
- int i;
-
- if (gidsetsize < 0)
- return -EINVAL;
-
- /* no need to grab task_lock here; it cannot change */
- i = cred->group_info->ngroups;
- if (gidsetsize) {
- if (i > gidsetsize) {
- i = -EINVAL;
- goto out;
- }
- if (groups_to_user(grouplist, cred->group_info)) {
- i = -EFAULT;
- goto out;
- }
- }
-out:
- return i;
-}
-
-/*
- * SMP: Our groups are copy-on-write. We can set them safely
- * without another task interfering.
- */
-
-SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
-{
- struct group_info *group_info;
- int retval;
-
- if (!capable(CAP_SETGID))
- return -EPERM;
- if ((unsigned)gidsetsize > NGROUPS_MAX)
- return -EINVAL;
-
- group_info = groups_alloc(gidsetsize);
- if (!group_info)
- return -ENOMEM;
- retval = groups_from_user(group_info, grouplist);
- if (retval) {
- put_group_info(group_info);
- return retval;
- }
-
- retval = set_current_groups(group_info);
- put_group_info(group_info);
-
- return retval;
-}
-
-/*
- * Check whether we're fsgid/egid or in the supplemental group..
- */
-int in_group_p(gid_t grp)
-{
- const struct cred *cred = current_cred();
- int retval = 1;
-
- if (grp != cred->fsgid)
- retval = groups_search(cred->group_info, grp);
- return retval;
-}
-
-EXPORT_SYMBOL(in_group_p);
-
-int in_egroup_p(gid_t grp)
-{
- const struct cred *cred = current_cred();
- int retval = 1;
-
- if (grp != cred->egid)
- retval = groups_search(cred->group_info, grp);
- return retval;
-}
-
-EXPORT_SYMBOL(in_egroup_p);
-
DECLARE_RWSEM(uts_sem);
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
@@ -1525,22 +1245,14 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
return -EINVAL;
if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
+ if (new_rlim.rlim_cur > new_rlim.rlim_max)
+ return -EINVAL;
old_rlim = current->signal->rlim + resource;
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
-
- if (resource == RLIMIT_NOFILE) {
- if (new_rlim.rlim_max == RLIM_INFINITY)
- new_rlim.rlim_max = sysctl_nr_open;
- if (new_rlim.rlim_cur == RLIM_INFINITY)
- new_rlim.rlim_cur = sysctl_nr_open;
- if (new_rlim.rlim_max > sysctl_nr_open)
- return -EPERM;
- }
-
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
- return -EINVAL;
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
+ return -EPERM;
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
@@ -1799,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_TSC:
error = SET_TSC_CTL(arg2);
break;
+ case PR_TASK_PERF_COUNTERS_DISABLE:
+ error = perf_counter_task_disable();
+ break;
+ case PR_TASK_PERF_COUNTERS_ENABLE:
+ error = perf_counter_task_enable();
+ break;
case PR_GET_TIMERSLACK:
error = current->timer_slack_ns;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad296738..44e5936118d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
cond_syscall(compat_sys_sendmsg);
cond_syscall(sys_recvmsg);
cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recvfrom);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
cond_syscall(compat_sys_futex);
@@ -175,3 +176,6 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 368d1638ee7..3125cff1c57 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/utsname.h>
+#include <linux/kmemcheck.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -48,6 +49,8 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
+#include <linux/slow-work.h>
+#include <linux/perf_counter.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -95,14 +98,15 @@ static int sixty = 60;
static int neg_one = -1;
#endif
-#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
-static int two = 2;
-#endif
-
static int zero;
-static int one = 1;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
+static unsigned long one_ul = 1;
static int one_hundred = 100;
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
+
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
@@ -112,6 +116,7 @@ static int ngroups_max = NGROUPS_MAX;
#ifdef CONFIG_MODULES
extern char modprobe_path[];
+extern int modules_disabled;
#endif
#ifdef CONFIG_CHR_DEV_SG
extern int sg_big_buff;
@@ -240,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#endif
static struct ctl_table kern_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#ifdef CONFIG_SCHED_DEBUG
{
.ctl_name = CTL_UNNUMBERED,
@@ -294,14 +307,6 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
.procname = "sched_features",
.data = &sysctl_sched_features,
.maxlen = sizeof(unsigned int),
@@ -324,6 +329,25 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_time_avg",
+ .data = &sysctl_sched_time_avg,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
{
.ctl_name = CTL_UNNUMBERED,
@@ -532,6 +556,17 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dostring,
.strategy = &sysctl_string,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
#endif
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
{
@@ -720,6 +755,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = KERN_BOOTLOADER_TYPE,
.procname = "bootloader_type",
.data = &bootloader_type,
@@ -729,6 +772,14 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "kstack_depth_to_print",
.data = &kstack_depth_to_print,
.maxlen = sizeof(int),
@@ -809,11 +860,24 @@ static struct ctl_table kern_table[] = {
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &proc_dosoftlockup_thresh,
.strategy = &sysctl_intvec,
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -829,7 +893,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
@@ -889,16 +953,51 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-#ifdef CONFIG_UNEVICTABLE_LRU
+#ifdef CONFIG_SLOW_WORK
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
+ .procname = "slow-work",
+ .mode = 0555,
+ .child = slow_work_sysctls,
+ },
+#endif
+#ifdef CONFIG_PERF_COUNTERS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_paranoid",
+ .data = &sysctl_perf_counter_paranoid,
+ .maxlen = sizeof(sysctl_perf_counter_paranoid),
.mode = 0644,
- .proc_handler = &scan_unevictable_handler,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_mlock_kb",
+ .data = &sysctl_perf_counter_mlock,
+ .maxlen = sizeof(sysctl_perf_counter_mlock),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "perf_counter_max_sample_rate",
+ .data = &sysctl_perf_counter_sample_rate,
+ .maxlen = sizeof(sysctl_perf_counter_sample_rate),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_KMEMCHECK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
},
#endif
+
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -974,7 +1073,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &dirty_background_bytes_handler,
.strategy = &sysctl_intvec,
- .extra1 = &one,
+ .extra1 = &one_ul,
},
{
.ctl_name = VM_DIRTY_RATIO,
@@ -995,7 +1094,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &dirty_bytes_handler,
.strategy = &sysctl_intvec,
- .extra1 = &one,
+ .extra1 = &dirty_bytes_min,
},
{
.procname = "dirty_writeback_centisecs",
@@ -1009,7 +1108,7 @@ static struct ctl_table vm_table[] = {
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
- .proc_handler = &proc_dointvec_userhz_jiffies,
+ .proc_handler = &proc_dointvec,
},
{
.ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -1212,16 +1311,14 @@ static struct ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
-#ifdef CONFIG_SECURITY
{
.ctl_name = CTL_UNNUMBERED,
.procname = "mmap_min_addr",
- .data = &mmap_min_addr,
- .maxlen = sizeof(unsigned long),
+ .data = &dac_mmap_min_addr,
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &mmap_min_addr_handler,
},
-#endif
#ifdef CONFIG_NUMA
{
.ctl_name = CTL_UNNUMBERED,
@@ -1259,6 +1356,14 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "scan_unevictable_pages",
+ .data = &scan_unevictable_pages,
+ .maxlen = sizeof(scan_unevictable_pages),
+ .mode = 0644,
+ .proc_handler = &scan_unevictable_handler,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@@ -1372,10 +1477,7 @@ static struct ctl_table fs_table[] = {
.data = &lease_break_time,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &zero,
- .extra2 = &two,
+ .proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_AIO
@@ -1416,7 +1518,10 @@ static struct ctl_table fs_table[] = {
.data = &suid_dumpable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &two,
},
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
@@ -2197,7 +2302,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
#define TMPBUFLEN 21
- int *i, vleft, first=1, neg, val;
+ int *i, vleft, first = 1, neg;
unsigned long lval;
size_t left, len;
@@ -2250,8 +2355,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
len = p-buf;
if ((len < left) && *p && !isspace(*p))
break;
- if (neg)
- val = -val;
s += len;
left -= len;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c..b38423ca711 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
{ NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
{ NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
{ NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
+ { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
{}
};
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30..ea8384d3caa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
/*
* Send taskstats data in @skb to listener with nl_pid @pid
*/
-static int send_reply(struct sk_buff *skb, pid_t pid)
+static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
return rc;
}
- return genlmsg_unicast(skb, pid);
+ return genlmsg_reply(skb, info);
}
/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
if (!skb_next)
break;
}
- rc = genlmsg_unicast(skb_cur, s->pid);
+ rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
if (rc == -ECONNREFUSED) {
s->valid = 0;
delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
goto err;
}
- rc = send_reply(rep_skb, info->snd_pid);
+ rc = send_reply(rep_skb, info);
err:
fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
} else
goto err;
- return send_reply(rep_skb, info->snd_pid);
+ return send_reply(rep_skb, info);
err:
nlmsg_free(rep_skb);
return rc;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 905b0b50792..0b0a6366c9d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83c..620b58abdc3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
return (unsigned long) clc;
}
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
/**
* clockevents_set_mode - set the operating mode of a clock event device
@@ -68,6 +70,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
if (dev->mode != mode) {
dev->set_mode(mode, dev);
dev->mode = mode;
+
+ /*
+ * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+ * on it, so fix it up and emit a warning:
+ */
+ if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (unlikely(!dev->mult)) {
+ dev->mult = 1;
+ WARN_ON(1);
+ }
+ }
}
}
@@ -124,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
*/
int clockevents_register_notifier(struct notifier_block *nb)
{
+ unsigned long flags;
int ret;
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
ret = raw_notifier_chain_register(&clockevents_chain, nb);
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
return ret;
}
@@ -165,26 +179,20 @@ static void clockevents_notify_released(void)
*/
void clockevents_register_device(struct clock_event_device *dev)
{
+ unsigned long flags;
+
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(!dev->cpumask);
- /*
- * A nsec2cyc multiplicator of 0 is invalid and we'd crash
- * on it, so fix it up and emit a warning:
- */
- if (unlikely(!dev->mult)) {
- dev->mult = 1;
- WARN_ON(1);
- }
-
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
clockevents_notify_released();
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
}
+EXPORT_SYMBOL_GPL(clockevents_register_device);
/*
* Noop handler when we shut down an event device
@@ -230,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
void clockevents_notify(unsigned long reason, void *arg)
{
struct list_head *node, *tmp;
+ unsigned long flags;
- spin_lock(&clockevents_lock);
+ spin_lock_irqsave(&clockevents_lock, flags);
clockevents_do_notify(reason, arg);
switch (reason) {
@@ -246,7 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
default:
break;
}
- spin_unlock(&clockevents_lock);
+ spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ca89e1593f0..7466cb81125 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,82 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
+void timecounter_init(struct timecounter *tc,
+ const struct cyclecounter *cc,
+ u64 start_tstamp)
+{
+ tc->cc = cc;
+ tc->cycle_last = cc->read(cc);
+ tc->nsec = start_tstamp;
+}
+EXPORT_SYMBOL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+ cycle_t cycle_now, cycle_delta;
+ u64 ns_offset;
+
+ /* read cycle counter: */
+ cycle_now = tc->cc->read(tc->cc);
+
+ /* calculate the delta since the last timecounter_read_delta(): */
+ cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
+
+ /* update time stamp of timecounter_read_delta() call: */
+ tc->cycle_last = cycle_now;
+
+ return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+ u64 nsec;
+
+ /* increment time by nanoseconds since last call */
+ nsec = timecounter_read_delta(tc);
+ nsec += tc->nsec;
+ tc->nsec = nsec;
+
+ return nsec;
+}
+EXPORT_SYMBOL(timecounter_read);
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+ cycle_t cycle_tstamp)
+{
+ u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+ u64 nsec;
+
+ /*
+ * Instead of always treating cycle_tstamp as more recent
+ * than tc->cycle_last, detect when it is too far in the
+ * future and treat it as old time stamp instead.
+ */
+ if (cycle_delta > tc->cc->mask / 2) {
+ cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+ nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
+ } else {
+ nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
+ }
+
+ return nsec;
+}
+EXPORT_SYMBOL(timecounter_cyc2time);
+
/* XXX - Would like a better way for initializing curr_clocksource */
extern struct clocksource clocksource_jiffies;
@@ -105,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
resumed = test_and_clear_bit(0, &watchdog_resumed);
- wdnow = watchdog->read();
+ wdnow = watchdog->read(watchdog);
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
watchdog_last = wdnow;
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
- csnow = cs->read();
+ csnow = cs->read(cs);
if (unlikely(resumed)) {
cs->wd_last = csnow;
@@ -171,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
list_add(&cs->wd_list, &watchdog_list);
if (!started && watchdog) {
- watchdog_last = watchdog->read();
+ watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer,
cpumask_first(cpu_online_mask));
@@ -192,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
/* Start if list is not empty */
if (!list_empty(&watchdog_list)) {
- watchdog_last = watchdog->read();
+ watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires =
jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer,
@@ -326,9 +402,6 @@ int clocksource_register(struct clocksource *c)
unsigned long flags;
int ret;
- /* save mult_orig on registration */
- c->mult_orig = c->mult;
-
spin_lock_irqsave(&clocksource_lock, flags);
ret = clocksource_enqueue(c);
if (!ret)
@@ -436,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
}
}
+ /*
+ * Check to make sure we don't switch to a non-highres capable
+ * clocksource if the tick code is in oneshot mode (highres or nohz)
+ */
+ if (tick_oneshot_mode_active() && ovr &&
+ !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
+ printk(KERN_WARNING "%s clocksource is not HRT compatible. "
+ "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
+ ovr = NULL;
+ override_name[0] = 0;
+ }
+
/* Reselect, when the override name has changed */
if (ovr != clocksource_override) {
clocksource_override = ovr;
@@ -464,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
spin_lock_irq(&clocksource_lock);
list_for_each_entry(src, &clocksource_list, list) {
- count += snprintf(buf + count,
+ /*
+ * Don't show non-HRES clocksource if the tick code is
+ * in one shot mode (highres=on or nohz=on)
+ */
+ if (!tick_oneshot_mode_active() ||
+ (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+ count += snprintf(buf + count,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
"%s ", src->name);
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f197560f3..c3f6c30816e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
*/
#define JIFFIES_SHIFT 8
-static cycle_t jiffies_read(void)
+static cycle_t jiffies_read(struct clocksource *cs)
{
return (cycle_t) jiffies;
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f5f793d9241..7fc64375ff4 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,71 +1,129 @@
/*
- * linux/kernel/time/ntp.c
- *
* NTP state machine interfaces and logic.
*
* This code was mainly moved from kernel/timer.c and kernel/time.c
* Please see those files for relevant copyright info and historical
* changelogs.
*/
-
-#include <linux/mm.h>
-#include <linux/time.h>
-#include <linux/timex.h>
-#include <linux/jiffies.h>
-#include <linux/hrtimer.h>
#include <linux/capability.h>
-#include <linux/math64.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
-#include <asm/timex.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/mm.h>
/*
- * Timekeeping variables
+ * NTP timekeeping variables:
*/
-unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
-unsigned long tick_nsec; /* ACTHZ period (nsec) */
-u64 tick_length;
-static u64 tick_length_base;
-static struct hrtimer leap_timer;
+/* USER_HZ period (usecs): */
+unsigned long tick_usec = TICK_USEC;
-#define MAX_TICKADJ 500 /* microsecs */
-#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
- NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+/* ACTHZ period (nsecs): */
+unsigned long tick_nsec;
+
+u64 tick_length;
+static u64 tick_length_base;
+
+static struct hrtimer leap_timer;
+
+#define MAX_TICKADJ 500LL /* usecs */
+#define MAX_TICKADJ_SCALED \
+ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
/*
* phase-lock loop variables
*/
-/* TIME_ERROR prevents overwriting the CMOS clock */
-static int time_state = TIME_OK; /* clock synchronization status */
-int time_status = STA_UNSYNC; /* clock status bits */
-static long time_tai; /* TAI offset (s) */
-static s64 time_offset; /* time adjustment (ns) */
-static long time_constant = 2; /* pll time constant */
-long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
-long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
-static s64 time_freq; /* frequency offset (scaled ns/s)*/
-static long time_reftime; /* time at last adjustment (s) */
-long time_adjust;
-static long ntp_tick_adj;
+/*
+ * clock synchronization status
+ *
+ * (TIME_ERROR prevents overwriting the CMOS clock)
+ */
+static int time_state = TIME_OK;
+
+/* clock status bits: */
+int time_status = STA_UNSYNC;
+
+/* TAI offset (secs): */
+static long time_tai;
+
+/* time adjustment (nsecs): */
+static s64 time_offset;
+
+/* pll time constant: */
+static long time_constant = 2;
+
+/* maximum error (usecs): */
+long time_maxerror = NTP_PHASE_LIMIT;
+
+/* estimated error (usecs): */
+long time_esterror = NTP_PHASE_LIMIT;
+
+/* frequency offset (scaled nsecs/secs): */
+static s64 time_freq;
+
+/* time at last adjustment (secs): */
+static long time_reftime;
+
+long time_adjust;
+
+/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
+static s64 ntp_tick_adj;
+
+/*
+ * NTP methods:
+ */
+
+/*
+ * Update (tick_length, tick_length_base, tick_nsec), based
+ * on (tick_usec, ntp_tick_adj, time_freq):
+ */
static void ntp_update_frequency(void)
{
- u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
- << NTP_SCALE_SHIFT;
- second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
- second_length += time_freq;
+ u64 second_length;
+ u64 new_base;
+
+ second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+ << NTP_SCALE_SHIFT;
+
+ second_length += ntp_tick_adj;
+ second_length += time_freq;
- tick_length_base = second_length;
+ tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
+ new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
- tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
- tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
+ /*
+ * Don't wait for the next second_overflow, apply
+ * the change to the tick length immediately:
+ */
+ tick_length += new_base - tick_length_base;
+ tick_length_base = new_base;
+}
+
+static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
+{
+ time_status &= ~STA_MODE;
+
+ if (secs < MINSEC)
+ return 0;
+
+ if (!(time_status & STA_FLL) && (secs <= MAXSEC))
+ return 0;
+
+ time_status |= STA_MODE;
+
+ return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
}
static void ntp_update_offset(long offset)
{
- long mtemp;
s64 freq_adj;
+ s64 offset64;
+ long secs;
if (!(time_status & STA_PLL))
return;
@@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- if (time_status & STA_FREQHOLD || time_reftime == 0)
- time_reftime = xtime.tv_sec;
- mtemp = xtime.tv_sec - time_reftime;
+ secs = xtime.tv_sec - time_reftime;
+ if (unlikely(time_status & STA_FREQHOLD))
+ secs = 0;
+
time_reftime = xtime.tv_sec;
- freq_adj = (s64)offset * mtemp;
- freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
- time_status &= ~STA_MODE;
- if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
- freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
- mtemp);
- time_status |= STA_MODE;
- }
- freq_adj += time_freq;
- freq_adj = min(freq_adj, MAXFREQ_SCALED);
- time_freq = max(freq_adj, -MAXFREQ_SCALED);
+ offset64 = offset;
+ freq_adj = (offset64 * secs) <<
+ (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
- time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+ freq_adj += ntp_update_offset_fll(offset64, secs);
+
+ freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
+
+ time_freq = max(freq_adj, -MAXFREQ_SCALED);
+
+ time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
}
/**
@@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)
*/
void ntp_clear(void)
{
- time_adjust = 0; /* stop active adjtime() */
- time_status |= STA_UNSYNC;
- time_maxerror = NTP_PHASE_LIMIT;
- time_esterror = NTP_PHASE_LIMIT;
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
ntp_update_frequency();
- tick_length = tick_length_base;
- time_offset = 0;
+ tick_length = tick_length_base;
+ time_offset = 0;
}
/*
@@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
xtime.tv_sec--;
wall_to_monotonic.tv_sec++;
time_state = TIME_OOP;
- printk(KERN_NOTICE "Clock: "
- "inserting leap second 23:59:60 UTC\n");
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
res = HRTIMER_RESTART;
break;
@@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_tai--;
wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
- printk(KERN_NOTICE "Clock: "
- "deleting leap second 23:59:59 UTC\n");
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
break;
case TIME_OOP:
time_tai++;
@@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
*/
void second_overflow(void)
{
- s64 time_adj;
+ s64 delta;
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -192,24 +249,30 @@ void second_overflow(void)
* Compute the phase adjustment for the next second. The offset is
* reduced by a fixed factor times the time constant.
*/
- tick_length = tick_length_base;
- time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
- time_offset -= time_adj;
- tick_length += time_adj;
-
- if (unlikely(time_adjust)) {
- if (time_adjust > MAX_TICKADJ) {
- time_adjust -= MAX_TICKADJ;
- tick_length += MAX_TICKADJ_SCALED;
- } else if (time_adjust < -MAX_TICKADJ) {
- time_adjust += MAX_TICKADJ;
- tick_length -= MAX_TICKADJ_SCALED;
- } else {
- tick_length += (s64)(time_adjust * NSEC_PER_USEC /
- NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
- time_adjust = 0;
- }
+ tick_length = tick_length_base;
+
+ delta = shift_right(time_offset, SHIFT_PLL + time_constant);
+ time_offset -= delta;
+ tick_length += delta;
+
+ if (!time_adjust)
+ return;
+
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ return;
}
+
+ if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ return;
+ }
+
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+ << NTP_SCALE_SHIFT;
+ time_adjust = 0;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
*/
- if (!ntp_synced())
+ if (!ntp_synced()) {
/*
* Not synced, exit, do not restart a timer (if one is
* running, let it run out).
*/
return;
+ }
getnstimeofday(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@@ -270,7 +334,116 @@ static void notify_cmos_timer(void)
static inline void notify_cmos_timer(void) { }
#endif
-/* adjtimex mainly allows reading (and writing, if superuser) of
+/*
+ * Start the leap seconds timer:
+ */
+static inline void ntp_start_leap_timer(struct timespec *ts)
+{
+ long now = ts->tv_sec;
+
+ if (time_status & STA_INS) {
+ time_state = TIME_INS;
+ now += 86400 - now % 86400;
+ hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+
+ return;
+ }
+
+ if (time_status & STA_DEL) {
+ time_state = TIME_DEL;
+ now += 86400 - (now + 1) % 86400;
+ hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+ }
+}
+
+/*
+ * Propagate a new txc->status value into the NTP state:
+ */
+static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+{
+ if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+ }
+
+ /*
+ * If we turn on PLL adjustments then reset the
+ * reference time to current time.
+ */
+ if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+ time_reftime = xtime.tv_sec;
+
+ /* only set allowed bits */
+ time_status &= STA_RONLY;
+ time_status |= txc->status & ~STA_RONLY;
+
+ switch (time_state) {
+ case TIME_OK:
+ ntp_start_leap_timer(ts);
+ break;
+ case TIME_INS:
+ case TIME_DEL:
+ time_state = TIME_OK;
+ ntp_start_leap_timer(ts);
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ case TIME_OOP:
+ hrtimer_restart(&leap_timer);
+ break;
+ }
+}
+/*
+ * Called with the xtime lock held, so we can access and modify
+ * all the global NTP state:
+ */
+static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+{
+ if (txc->modes & ADJ_STATUS)
+ process_adj_status(txc, ts);
+
+ if (txc->modes & ADJ_NANO)
+ time_status |= STA_NANO;
+
+ if (txc->modes & ADJ_MICRO)
+ time_status &= ~STA_NANO;
+
+ if (txc->modes & ADJ_FREQUENCY) {
+ time_freq = txc->freq * PPM_SCALE;
+ time_freq = min(time_freq, MAXFREQ_SCALED);
+ time_freq = max(time_freq, -MAXFREQ_SCALED);
+ }
+
+ if (txc->modes & ADJ_MAXERROR)
+ time_maxerror = txc->maxerror;
+
+ if (txc->modes & ADJ_ESTERROR)
+ time_esterror = txc->esterror;
+
+ if (txc->modes & ADJ_TIMECONST) {
+ time_constant = txc->constant;
+ if (!(time_status & STA_NANO))
+ time_constant += 4;
+ time_constant = min(time_constant, (long)MAXTC);
+ time_constant = max(time_constant, 0l);
+ }
+
+ if (txc->modes & ADJ_TAI && txc->constant > 0)
+ time_tai = txc->constant;
+
+ if (txc->modes & ADJ_OFFSET)
+ ntp_update_offset(txc->offset);
+
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+}
+
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
int do_adjtimex(struct timex *txc)
@@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
- /* if the quartz is off by more than 10% something is VERY wrong! */
+ /*
+ * if the quartz is off by more than 10% then
+ * something is VERY wrong!
+ */
if (txc->modes & ADJ_TICK &&
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
- return -EINVAL;
+ return -EINVAL;
if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
hrtimer_cancel(&leap_timer);
@@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)
write_seqlock_irq(&xtime_lock);
- /* If there are input parameters, then process them */
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)
ntp_update_frequency();
}
txc->offset = save_adjust;
- goto adj_done;
- }
- if (txc->modes) {
- long sec;
-
- if (txc->modes & ADJ_STATUS) {
- if ((time_status & STA_PLL) &&
- !(txc->status & STA_PLL)) {
- time_state = TIME_OK;
- time_status = STA_UNSYNC;
- }
- /* only set allowed bits */
- time_status &= STA_RONLY;
- time_status |= txc->status & ~STA_RONLY;
-
- switch (time_state) {
- case TIME_OK:
- start_timer:
- sec = ts.tv_sec;
- if (time_status & STA_INS) {
- time_state = TIME_INS;
- sec += 86400 - sec % 86400;
- hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
- } else if (time_status & STA_DEL) {
- time_state = TIME_DEL;
- sec += 86400 - (sec + 1) % 86400;
- hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
- }
- break;
- case TIME_INS:
- case TIME_DEL:
- time_state = TIME_OK;
- goto start_timer;
- break;
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- break;
- case TIME_OOP:
- hrtimer_restart(&leap_timer);
- break;
- }
- }
-
- if (txc->modes & ADJ_NANO)
- time_status |= STA_NANO;
- if (txc->modes & ADJ_MICRO)
- time_status &= ~STA_NANO;
-
- if (txc->modes & ADJ_FREQUENCY) {
- time_freq = (s64)txc->freq * PPM_SCALE;
- time_freq = min(time_freq, MAXFREQ_SCALED);
- time_freq = max(time_freq, -MAXFREQ_SCALED);
- }
-
- if (txc->modes & ADJ_MAXERROR)
- time_maxerror = txc->maxerror;
- if (txc->modes & ADJ_ESTERROR)
- time_esterror = txc->esterror;
-
- if (txc->modes & ADJ_TIMECONST) {
- time_constant = txc->constant;
- if (!(time_status & STA_NANO))
- time_constant += 4;
- time_constant = min(time_constant, (long)MAXTC);
- time_constant = max(time_constant, 0l);
- }
-
- if (txc->modes & ADJ_TAI && txc->constant > 0)
- time_tai = txc->constant;
-
- if (txc->modes & ADJ_OFFSET)
- ntp_update_offset(txc->offset);
- if (txc->modes & ADJ_TICK)
- tick_usec = txc->tick;
+ } else {
- if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
- ntp_update_frequency();
- }
+ /* If there are input parameters, then process them: */
+ if (txc->modes)
+ process_adjtimex_modes(txc, &ts);
- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
- if (!(time_status & STA_NANO))
- txc->offset /= NSEC_PER_USEC;
+ if (!(time_status & STA_NANO))
+ txc->offset /= NSEC_PER_USEC;
+ }
-adj_done:
result = time_state; /* mostly `TIME_OK' */
if (time_status & (STA_UNSYNC|STA_CLOCKERR))
result = TIME_ERROR;
txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
- (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
@@ -425,6 +526,7 @@ adj_done:
txc->calcnt = 0;
txc->errcnt = 0;
txc->stbcnt = 0;
+
write_sequnlock_irq(&xtime_lock);
txc->time.tv_sec = ts.tv_sec;
@@ -440,6 +542,8 @@ adj_done:
static int __init ntp_tick_adj_setup(char *str)
{
ntp_tick_adj = simple_strtol(str, NULL, 0);
+ ntp_tick_adj <<= NTP_SCALE_SHIFT;
+
return 1;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9..c2ec25087a3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
* timer stops in C3 state.
*/
-struct tick_device tick_broadcast_device;
+static struct tick_device tick_broadcast_device;
/* FIXME: Use cpumask_var_t. */
static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
static DECLARE_BITMAP(tmpmask, NR_CPUS);
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
* Powerstate information: The system enters/leaves a state, where
* affected devices might stop
*/
-static void tick_do_broadcast_on_off(void *why)
+static void tick_do_broadcast_on_off(unsigned long *reason)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags, *reason = why;
+ unsigned long flags;
int cpu, bc_stopped;
spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
"offline CPU #%d\n", *oncpu);
else
- smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
- &reason, 1);
+ tick_do_broadcast_on_off(&reason);
}
/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a0..83c4417b6a3 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
for (;;) {
if (!clockevents_program_event(dev, next, ktime_get()))
return;
- tick_periodic(cpu);
+ /*
+ * Have to be careful here. If we're in oneshot mode,
+ * before we call tick_periodic() in a loop, we need
+ * to be sure we're using a real hardware clocksource.
+ * Otherwise we could get trapped in an infinite
+ * loop, as the tick_periodic() increments jiffies,
+ * when then will increment time, posibly causing
+ * the loop to trigger again and again.
+ */
+ if (timekeeping_valid_for_hres())
+ tick_periodic(cpu);
next = ktime_add(next, tick_period);
}
}
@@ -274,6 +284,21 @@ out_bc:
}
/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = cpumask_first(cpu_online_mask);
+
+ tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
+ TICK_DO_TIMER_NONE;
+ }
+}
+
+/*
* Shutdown an event device on a given cpu:
*
* This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +322,6 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
- /* Transfer the do_timer job away from this cpu */
- if (*cpup == tick_do_timer_cpu) {
- int cpu = cpumask_first(cpu_online_mask);
-
- tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
- TICK_DO_TIMER_NONE;
- }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -357,6 +375,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
tick_broadcast_oneshot_control(reason);
break;
+ case CLOCK_EVT_NOTIFY_CPU_DYING:
+ tick_handover_do_timer(dev);
+ break;
+
case CLOCK_EVT_NOTIFY_CPU_DEAD:
tick_shutdown_broadcast_oneshot(dev);
tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e76..a96c0e2b89c 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
return 0;
}
+/**
+ * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ *
+ * returns 1 when either nohz or highres are enabled. otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+ local_irq_restore(flags);
+
+ return ret;
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
/**
* tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0..e0f59a21c06 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
* value. We do this unconditionally on any cpu, as we don't know whether the
* cpu, which has the update task assigned is in a long sleep.
*/
-void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
cpu = smp_processor_id();
ts = &per_cpu(tick_cpu_sched, cpu);
+
+ /*
+ * Call to tick_nohz_start_idle stops the last_update_time from being
+ * updated. Thus, it must not be called in the event we are called from
+ * irq_exit() with the prior state different than idle.
+ */
+ if (!inidle && !ts->inidle)
+ goto end;
+
now = tick_nohz_start_idle(ts);
/*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
goto end;
- if (!inidle && !ts->inidle)
- goto end;
-
ts->inidle = 1;
if (need_resched())
@@ -349,7 +355,7 @@ void tick_nohz_stop_sched_tick(int inidle)
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
goto out;
@@ -395,7 +401,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
@@ -698,7 +704,8 @@ void tick_setup_sched_timer(void)
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 00000000000..71e7f1a1915
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/timecompare.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+
+/*
+ * fixed point arithmetic scale factor for skew
+ *
+ * Usually one would measure skew in ppb (parts per billion, 1e9), but
+ * using a factor of 2 simplifies the math.
+ */
+#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
+
+ktime_t timecompare_transform(struct timecompare *sync,
+ u64 source_tstamp)
+{
+ u64 nsec;
+
+ nsec = source_tstamp + sync->offset;
+ nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
+ TIMECOMPARE_SKEW_RESOLUTION;
+
+ return ns_to_ktime(nsec);
+}
+EXPORT_SYMBOL(timecompare_transform);
+
+int timecompare_offset(struct timecompare *sync,
+ s64 *offset,
+ u64 *source_tstamp)
+{
+ u64 start_source = 0, end_source = 0;
+ struct {
+ s64 offset;
+ s64 duration_target;
+ } buffer[10], sample, *samples;
+ int counter = 0, i;
+ int used;
+ int index;
+ int num_samples = sync->num_samples;
+
+ if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+ samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
+ if (!samples) {
+ samples = buffer;
+ num_samples = sizeof(buffer)/sizeof(buffer[0]);
+ }
+ } else {
+ samples = buffer;
+ }
+
+ /* run until we have enough valid samples, but do not try forever */
+ i = 0;
+ counter = 0;
+ while (1) {
+ u64 ts;
+ ktime_t start, end;
+
+ start = sync->target();
+ ts = timecounter_read(sync->source);
+ end = sync->target();
+
+ if (!i)
+ start_source = ts;
+
+ /* ignore negative durations */
+ sample.duration_target = ktime_to_ns(ktime_sub(end, start));
+ if (sample.duration_target >= 0) {
+ /*
+ * assume symetric delay to and from source:
+ * average target time corresponds to measured
+ * source time
+ */
+ sample.offset =
+ ktime_to_ns(ktime_add(end, start)) / 2 -
+ ts;
+
+ /* simple insertion sort based on duration */
+ index = counter - 1;
+ while (index >= 0) {
+ if (samples[index].duration_target <
+ sample.duration_target)
+ break;
+ samples[index + 1] = samples[index];
+ index--;
+ }
+ samples[index + 1] = sample;
+ counter++;
+ }
+
+ i++;
+ if (counter >= num_samples || i >= 100000) {
+ end_source = ts;
+ break;
+ }
+ }
+
+ *source_tstamp = (end_source + start_source) / 2;
+
+ /* remove outliers by only using 75% of the samples */
+ used = counter * 3 / 4;
+ if (!used)
+ used = counter;
+ if (used) {
+ /* calculate average */
+ s64 off = 0;
+ for (index = 0; index < used; index++)
+ off += samples[index].offset;
+ *offset = div_s64(off, used);
+ }
+
+ if (samples && samples != buffer)
+ kfree(samples);
+
+ return used;
+}
+EXPORT_SYMBOL(timecompare_offset);
+
+void __timecompare_update(struct timecompare *sync,
+ u64 source_tstamp)
+{
+ s64 offset;
+ u64 average_time;
+
+ if (!timecompare_offset(sync, &offset, &average_time))
+ return;
+
+ if (!sync->last_update) {
+ sync->last_update = average_time;
+ sync->offset = offset;
+ sync->skew = 0;
+ } else {
+ s64 delta_nsec = average_time - sync->last_update;
+
+ /* avoid division by negative or small deltas */
+ if (delta_nsec >= 10000) {
+ s64 delta_offset_nsec = offset - sync->offset;
+ s64 skew; /* delta_offset_nsec *
+ TIMECOMPARE_SKEW_RESOLUTION /
+ delta_nsec */
+ u64 divisor;
+
+ /* div_s64() is limited to 32 bit divisor */
+ skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
+ divisor = delta_nsec;
+ while (unlikely(divisor >= ((s64)1) << 32)) {
+ /* divide both by 2; beware, right shift
+ of negative value has undefined
+ behavior and can only be used for
+ the positive divisor */
+ skew = div_s64(skew, 2);
+ divisor >>= 1;
+ }
+ skew = div_s64(skew, divisor);
+
+ /*
+ * Calculate new overall skew as 4/16 the
+ * old value and 12/16 the new one. This is
+ * a rather arbitrary tradeoff between
+ * only using the latest measurement (0/16 and
+ * 16/16) and even more weight on past measurements.
+ */
+#define TIMECOMPARE_NEW_SKEW_PER_16 12
+ sync->skew =
+ div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
+ sync->skew +
+ TIMECOMPARE_NEW_SKEW_PER_16 * skew,
+ 16);
+ sync->last_update = average_time;
+ sync->offset = offset;
+ }
+ }
+}
+EXPORT_SYMBOL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 900f1b6598d..e8c77d9c633 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
clock->cycle_last = cycle_now;
nsec = cyc2ns(clock, cycle_delta);
+
+ /* If arch requires, add in gettimeoffset() */
+ nsec += arch_gettimeoffset();
+
timespec_add_ns(&xtime, nsec);
nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
/* convert to nanoseconds: */
nsecs = cyc2ns(clock, cycle_delta);
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
+
} while (read_seqretry(&xtime_lock, seq));
timespec_add_ns(ts, nsecs);
@@ -182,7 +189,7 @@ EXPORT_SYMBOL(do_settimeofday);
*/
static void change_clocksource(void)
{
- struct clocksource *new;
+ struct clocksource *new, *old;
new = clocksource_get_next();
@@ -191,11 +198,16 @@ static void change_clocksource(void)
clocksource_forward_now();
- new->raw_time = clock->raw_time;
+ if (clocksource_enable(new))
+ return;
+ new->raw_time = clock->raw_time;
+ old = clock;
clock = new;
+ clocksource_disable(old);
+
clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(new);
+ clock->cycle_last = clocksource_read(clock);
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -292,6 +304,7 @@ void __init timekeeping_init(void)
ntp_init();
clock = clocksource_get_next();
+ clocksource_enable(clock);
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a127..fddd69d16e0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
+ pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c994530d166..4cde8b9c716 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -96,7 +96,7 @@ static DEFINE_MUTEX(show_mutex);
/*
* Collection status, active/inactive:
*/
-static int __read_mostly active;
+int __read_mostly timer_stats_active;
/*
* Beginning/end timestamps of measurement:
@@ -242,7 +242,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
struct entry *entry, input;
unsigned long flags;
- if (likely(!active))
+ if (likely(!timer_stats_active))
return;
lock = &per_cpu(lookup_lock, raw_smp_processor_id());
@@ -254,7 +254,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
input.timer_flag = timer_flag;
spin_lock_irqsave(lock, flags);
- if (!active)
+ if (!timer_stats_active)
goto out_unlock;
entry = tstat_lookup(&input, comm);
@@ -290,7 +290,7 @@ static int tstats_show(struct seq_file *m, void *v)
/*
* If still active then calculate up to now:
*/
- if (active)
+ if (timer_stats_active)
time_stop = ktime_get();
time = ktime_sub(time_stop, time_start);
@@ -368,18 +368,18 @@ static ssize_t tstats_write(struct file *file, const char __user *buf,
mutex_lock(&show_mutex);
switch (ctl[0]) {
case '0':
- if (active) {
- active = 0;
+ if (timer_stats_active) {
+ timer_stats_active = 0;
time_stop = ktime_get();
sync_access();
}
break;
case '1':
- if (!active) {
+ if (!timer_stats_active) {
reset_entries();
time_start = ktime_get();
smp_mb();
- active = 1;
+ timer_stats_active = 1;
}
break;
default:
diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64fe143..a3d25f41501 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -378,6 +380,8 @@ static void timer_stats_account_timer(struct timer_list *timer)
{
unsigned int flag = 0;
+ if (likely(!timer->start_site))
+ return;
if (unlikely(tbase_get_deferrable(timer->base)))
flag |= TIMER_STATS_FLAG_DEFERRABLE;
@@ -491,14 +495,18 @@ static inline void debug_timer_free(struct timer_list *timer)
debug_object_free(timer, &timer_debug_descr);
}
-static void __init_timer(struct timer_list *timer);
+static void __init_timer(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key);
-void init_timer_on_stack(struct timer_list *timer)
+void init_timer_on_stack_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- __init_timer(timer);
+ __init_timer(timer, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack);
+EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
void destroy_timer_on_stack(struct timer_list *timer)
{
@@ -512,7 +520,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
#endif
-static void __init_timer(struct timer_list *timer)
+static void __init_timer(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key)
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,28 +531,36 @@ static void __init_timer(struct timer_list *timer)
timer->start_pid = -1;
memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
+ lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
/**
- * init_timer - initialize a timer.
+ * init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @name: name of the timer
+ * @key: lockdep class key of the fake lock used for tracking timer
+ * sync lock dependencies
*
- * init_timer() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer(struct timer_list *timer)
+void init_timer_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key)
{
debug_timer_init(timer);
- __init_timer(timer);
+ __init_timer(timer, name, key);
}
-EXPORT_SYMBOL(init_timer);
+EXPORT_SYMBOL(init_timer_key);
-void init_timer_deferrable(struct timer_list *timer)
+void init_timer_deferrable_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key)
{
- init_timer(timer);
+ init_timer_key(timer, name, key);
timer_set_deferrable(timer);
}
-EXPORT_SYMBOL(init_timer_deferrable);
+EXPORT_SYMBOL(init_timer_deferrable_key);
static inline void detach_timer(struct timer_list *timer,
int clear_pending)
@@ -589,11 +607,13 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
}
-int __mod_timer(struct timer_list *timer, unsigned long expires)
+static inline int
+__mod_timer(struct timer_list *timer, unsigned long expires,
+ bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret = 0;
+ int ret = 0 , cpu;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -603,12 +623,27 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
if (timer_pending(timer)) {
detach_timer(timer, 0);
ret = 1;
+ } else {
+ if (pending_only)
+ goto out_unlock;
}
debug_timer_activate(timer);
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+ int preferred_cpu = get_nohz_load_balancer();
+
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+#endif
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
@@ -629,42 +664,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
timer->expires = expires;
internal_add_timer(base, timer);
+
+out_unlock:
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
-EXPORT_SYMBOL(__mod_timer);
-
/**
- * add_timer_on - start a timer on a particular CPU
- * @timer: the timer to be added
- * @cpu: the CPU to start it on
+ * mod_timer_pending - modify a pending timer's timeout
+ * @timer: the pending timer to be modified
+ * @expires: new timeout in jiffies
*
- * This is not very scalable on SMP. Double adds are not possible.
+ * mod_timer_pending() is the same for pending timers as mod_timer(),
+ * but will not re-activate and modify already deleted timers.
+ *
+ * It is useful for unserialized use of timers.
*/
-void add_timer_on(struct timer_list *timer, int cpu)
+int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
- struct tvec_base *base = per_cpu(tvec_bases, cpu);
- unsigned long flags;
-
- timer_stats_timer_set_start_info(timer);
- BUG_ON(timer_pending(timer) || !timer->function);
- spin_lock_irqsave(&base->lock, flags);
- timer_set_base(timer, base);
- debug_timer_activate(timer);
- internal_add_timer(base, timer);
- /*
- * Check whether the other CPU is idle and needs to be
- * triggered to reevaluate the timer wheel when nohz is
- * active. We are protected against the other CPU fiddling
- * with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to idle can not evaluate
- * the timer wheel.
- */
- wake_up_idle_cpu(cpu);
- spin_unlock_irqrestore(&base->lock, flags);
+ return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
}
+EXPORT_SYMBOL(mod_timer_pending);
/**
* mod_timer - modify a timer's timeout
@@ -688,21 +709,91 @@ void add_timer_on(struct timer_list *timer, int cpu)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
- BUG_ON(!timer->function);
-
- timer_stats_timer_set_start_info(timer);
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
* to be the same thing then just return:
*/
+ if (timer_pending(timer) && timer->expires == expires)
+ return 1;
+
+ return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+}
+EXPORT_SYMBOL(mod_timer);
+
+/**
+ * mod_timer_pinned - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pinned() is a way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ * and not allow the timer to be migrated to a different CPU.
+ *
+ * mod_timer_pinned(timer, expires) is equivalent to:
+ *
+ * del_timer(timer); timer->expires = expires; add_timer(timer);
+ */
+int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+{
if (timer->expires == expires && timer_pending(timer))
return 1;
- return __mod_timer(timer, expires);
+ return __mod_timer(timer, expires, false, TIMER_PINNED);
}
+EXPORT_SYMBOL(mod_timer_pinned);
-EXPORT_SYMBOL(mod_timer);
+/**
+ * add_timer - start a timer
+ * @timer: the timer to be added
+ *
+ * The kernel will do a ->function(->data) callback from the
+ * timer interrupt at the ->expires point in the future. The
+ * current time is 'jiffies'.
+ *
+ * The timer's ->expires, ->function (and if the handler uses it, ->data)
+ * fields must be set prior calling this function.
+ *
+ * Timers with an ->expires field in the past will be executed in the next
+ * timer tick.
+ */
+void add_timer(struct timer_list *timer)
+{
+ BUG_ON(timer_pending(timer));
+ mod_timer(timer, timer->expires);
+}
+EXPORT_SYMBOL(add_timer);
+
+/**
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP. Double adds are not possible.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+ struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ unsigned long flags;
+
+ timer_stats_timer_set_start_info(timer);
+ BUG_ON(timer_pending(timer) || !timer->function);
+ spin_lock_irqsave(&base->lock, flags);
+ timer_set_base(timer, base);
+ debug_timer_activate(timer);
+ internal_add_timer(base, timer);
+ /*
+ * Check whether the other CPU is idle and needs to be
+ * triggered to reevaluate the timer wheel when nohz is
+ * active. We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to idle can not evaluate
+ * the timer wheel.
+ */
+ wake_up_idle_cpu(cpu);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_timer_on);
/**
* del_timer - deactive a timer.
@@ -733,7 +824,6 @@ int del_timer(struct timer_list *timer)
return ret;
}
-
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
@@ -767,7 +857,6 @@ out:
return ret;
}
-
EXPORT_SYMBOL(try_to_del_timer_sync);
/**
@@ -789,6 +878,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*/
int del_timer_sync(struct timer_list *timer)
{
+#ifdef CONFIG_LOCKDEP
+ unsigned long flags;
+
+ local_irq_save(flags);
+ lock_map_acquire(&timer->lockdep_map);
+ lock_map_release(&timer->lockdep_map);
+ local_irq_restore(flags);
+#endif
+
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -796,7 +894,6 @@ int del_timer_sync(struct timer_list *timer)
cpu_relax();
}
}
-
EXPORT_SYMBOL(del_timer_sync);
#endif
@@ -861,10 +958,36 @@ static inline void __run_timers(struct tvec_base *base)
set_running_timer(base, timer);
detach_timer(timer, 1);
+
spin_unlock_irq(&base->lock);
{
int preempt_count = preempt_count();
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the timer from
+ * inside the function that is called from
+ * it, this we need to take into account for
+ * lockdep too. To avoid bogus "held lock
+ * freed" warnings as well as problems when
+ * looking into timer->lockdep_map, make a
+ * copy and use that here.
+ */
+ struct lockdep_map lockdep_map =
+ timer->lockdep_map;
+#endif
+ /*
+ * Couple the lock chain with the lock chain at
+ * del_timer_sync() by acquiring the lock_map
+ * around the fn() call here and in
+ * del_timer_sync().
+ */
+ lock_map_acquire(&lockdep_map);
+
fn(data);
+
+ lock_map_release(&lockdep_map);
+
if (preempt_count != preempt_count()) {
printk(KERN_ERR "huh, entered %p "
"with preempt_count %08x, exited"
@@ -930,6 +1053,9 @@ cascade:
index = slot = timer_jiffies & TVN_MASK;
do {
list_for_each_entry(nte, varp->vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
found = 1;
if (time_before(nte->expires, expires))
expires = nte->expires;
@@ -1030,61 +1156,21 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- if (rcu_pending(cpu))
- rcu_check_callbacks(cpu, user_tick);
+ rcu_check_callbacks(cpu, user_tick);
printk_tick();
scheduler_tick();
run_posix_cpu_timers(p);
}
/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
-
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = count_active_tasks();
- do {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
- }
-}
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ perf_counter_do_pending();
+
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1102,16 +1188,6 @@ void run_local_timers(void)
}
/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
- update_wall_time();
- calc_load(ticks);
-}
-
-/*
* The 64-bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock.
* jiffies is defined in the linker script...
@@ -1120,7 +1196,8 @@ static inline void update_times(unsigned long ticks)
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- update_times(ticks);
+ update_wall_time();
+ calc_global_load();
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1268,7 +1345,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
- __mod_timer(&timer, expire);
+ __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
schedule();
del_singleshot_timer_sync(&timer);
@@ -1321,37 +1398,17 @@ int do_sysinfo(struct sysinfo *info)
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
- unsigned long seq;
+ struct timespec tp;
memset(info, 0, sizeof(struct sysinfo));
- do {
- struct timespec tp;
- seq = read_seqbegin(&xtime_lock);
-
- /*
- * This is annoying. The below is the same thing
- * posix_get_clock_monotonic() does, but it wants to
- * take the lock which we want to cover the loads stuff
- * too.
- */
-
- getnstimeofday(&tp);
- tp.tv_sec += wall_to_monotonic.tv_sec;
- tp.tv_nsec += wall_to_monotonic.tv_nsec;
- monotonic_to_bootbased(&tp);
- if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- tp.tv_sec++;
- }
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
- } while (read_seqretry(&xtime_lock, seq));
+ info->procs = nr_threads;
si_meminfo(info);
si_swapinfo(info);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a..1ea0d1234f4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,12 +9,22 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_FTRACE_NMI_ENTER
+ bool
+
config HAVE_FUNCTION_TRACER
bool
config HAVE_FUNCTION_GRAPH_TRACER
bool
+config HAVE_FUNCTION_GRAPH_FP_TEST
+ bool
+ help
+ An arch may pass in a unique value (frame pointer) to both the
+ entering and exiting of a function. On exit, the value is compared
+ and if it does not match, then it will panic the kernel.
+
config HAVE_FUNCTION_TRACE_MCOUNT_TEST
bool
help
@@ -31,12 +41,40 @@ config HAVE_FTRACE_MCOUNT_RECORD
config HAVE_HW_BRANCH_TRACER
bool
+config HAVE_SYSCALL_TRACEPOINTS
+ bool
+
config TRACER_MAX_TRACE
bool
config RING_BUFFER
bool
+config FTRACE_NMI_ENTER
+ bool
+ depends on HAVE_FTRACE_NMI_ENTER
+ default y
+
+config EVENT_TRACING
+ select CONTEXT_SWITCH_TRACER
+ bool
+
+config CONTEXT_SWITCH_TRACER
+ bool
+
+config RING_BUFFER_ALLOW_SWAP
+ bool
+ help
+ Allow the use of ring_buffer_swap_cpu.
+ Adds a very slight overhead to tracing when enabled.
+
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
config TRACING
bool
select DEBUG_FS
@@ -44,15 +82,43 @@ config TRACING
select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
select NOP_TRACER
+ select BINARY_PRINTF
+ select EVENT_TRACING
-menu "Tracers"
+config GENERIC_TRACER
+ bool
+ select TRACING
+
+#
+# Minimum requirements an architecture has to meet for us to
+# be able to offer generic tracing facilities:
+#
+config TRACING_SUPPORT
+ bool
+ # PPC32 has no irqflags tracing support, but it can use most of the
+ # tracers anyway, they were tested to build and work. Note that new
+ # exceptions to this list aren't welcomed, better implement the
+ # irqflags tracing for your architecture.
+ depends on TRACE_IRQFLAGS_SUPPORT || PPC32
+ depends on STACKTRACE_SUPPORT
+ default y
+
+if TRACING_SUPPORT
+
+menuconfig FTRACE
+ bool "Tracers"
+ default y if DEBUG_KERNEL
+ help
+ Enable the kernel tracing infrastructure.
+
+if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- depends on DEBUG_KERNEL
select FRAME_POINTER
- select TRACING
+ select KALLSYMS
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
Enable the kernel to trace every kernel function. This is done
@@ -67,25 +133,26 @@ config FUNCTION_GRAPH_TRACER
bool "Kernel Function Graph Tracer"
depends on HAVE_FUNCTION_GRAPH_TRACER
depends on FUNCTION_TRACER
+ depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
default y
help
Enable the kernel to trace a function at both its return
and its entry.
- It's first purpose is to trace the duration of functions and
- draw a call graph for each thread with some informations like
- the return value.
- This is done by setting the current return address on the current
- task structure into a stack of calls.
+ Its first purpose is to trace the duration of functions and
+ draw a call graph for each thread with some information like
+ the return value. This is done by setting the current return
+ address on the current task structure into a stack of calls.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
- depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
- select TRACING
+ select GENERIC_TRACER
select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
help
This option measures the time spent in irqs-off critical
sections, with microsecond accuracy.
@@ -94,7 +161,7 @@ config IRQSOFF_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /debugfs/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
(Note that kernel size and overhead increases with this option
enabled. This option and the preempt-off timing option can be
@@ -105,9 +172,9 @@ config PREEMPT_TRACER
default n
depends on GENERIC_TIME
depends on PREEMPT
- depends on DEBUG_KERNEL
- select TRACING
+ select GENERIC_TRACER
select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
help
This option measures the time spent in preemption off critical
sections, with microsecond accuracy.
@@ -116,7 +183,7 @@ config PREEMPT_TRACER
disabled by default and can be runtime (re-)started
via:
- echo 0 > /debugfs/tracing/tracing_max_latency
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
(Note that kernel size and overhead increases with this option
enabled. This option and the irqs-off timing option can be
@@ -125,79 +192,111 @@ config PREEMPT_TRACER
config SYSPROF_TRACER
bool "Sysprof Tracer"
depends on X86
- select TRACING
+ select GENERIC_TRACER
+ select CONTEXT_SWITCH_TRACER
help
This tracer provides the trace needed by the 'Sysprof' userspace
tool.
config SCHED_TRACER
bool "Scheduling Latency Tracer"
- depends on DEBUG_KERNEL
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
-config CONTEXT_SWITCH_TRACER
- bool "Trace process context switches"
- depends on DEBUG_KERNEL
+config ENABLE_DEFAULT_TRACERS
+ bool "Trace process context switches and events"
+ depends on !GENERIC_TRACER
select TRACING
- select MARKERS
help
- This tracer gets called from the context switch and records
- all switching of tasks.
+ This tracer hooks to various trace points in the kernel
+ allowing the user to pick and choose which trace point they
+ want to trace. It also includes the sched_switch tracer plugin.
+
+config FTRACE_SYSCALLS
+ bool "Trace syscalls"
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ select GENERIC_TRACER
+ select KALLSYMS
+ help
+ Basic tracer to catch the syscall entry and exit events.
config BOOT_TRACER
bool "Trace boot initcalls"
- depends on DEBUG_KERNEL
- select TRACING
+ select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
help
This tracer helps developers to optimize boot times: it records
the timings of the initcalls and traces key events and the identity
of tasks that can cause boot delays, such as context-switches.
- Its aim is to be parsed by the /scripts/bootgraph.pl tool to
+ Its aim is to be parsed by the scripts/bootgraph.pl tool to
produce pretty graphics about boot inefficiencies, giving a visual
representation of the delays during initcalls - but the raw
/debug/tracing/trace text output is readable too.
- ( Note that tracing self tests can't be enabled if this tracer is
- selected, because the self-tests are an initcall as well and that
- would invalidate the boot trace. )
+ You must pass in initcall_debug and ftrace=initcall to the kernel
+ command line to enable this on bootup.
config TRACE_BRANCH_PROFILING
+ bool
+ select GENERIC_TRACER
+
+choice
+ prompt "Branch Profiling"
+ default BRANCH_PROFILE_NONE
+ help
+ The branch profiling is a software profiler. It will add hooks
+ into the C conditionals to test which path a branch takes.
+
+ The likely/unlikely profiler only looks at the conditions that
+ are annotated with a likely or unlikely macro.
+
+ The "all branch" profiler will profile every if statement in the
+ kernel. This profiler will also enable the likely/unlikely
+ profiler as well.
+
+ Either of the above profilers add a bit of overhead to the system.
+ If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+ bool "No branch profiling"
+ help
+ No branch profiling. Branch profiling adds a bit of overhead.
+ Only enable it if you want to analyse the branching behavior.
+ Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
bool "Trace likely/unlikely profiler"
- depends on DEBUG_KERNEL
- select TRACING
+ select TRACE_BRANCH_PROFILING
help
This tracer profiles all the the likely and unlikely macros
in the kernel. It will display the results in:
- /debugfs/tracing/profile_annotated_branch
+ /sys/kernel/debug/tracing/profile_annotated_branch
Note: this will add a significant overhead, only turn this
on if you need to profile the system's use of these macros.
- Say N if unsure.
-
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals"
- depends on TRACE_BRANCH_PROFILING
+ select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /debugfs/tracing/profile_branch
+ /sys/kernel/debug/tracing/profile_branch
+
+ This option also enables the likely/unlikely profiler.
This configuration, when enabled, will impose a great overhead
on the system. This should only be enabled when the system
is to be analyzed
-
- Say N if unsure.
+endchoice
config TRACING_BRANCHES
bool
@@ -223,9 +322,8 @@ config BRANCH_TRACER
config POWER_TRACER
bool "Trace power consumption behavior"
- depends on DEBUG_KERNEL
depends on X86
- select TRACING
+ select GENERIC_TRACER
help
This tracer helps developers to analyze and optimize the kernels
power management decisions, specifically the C-state and P-state
@@ -235,12 +333,12 @@ config POWER_TRACER
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
- depends on DEBUG_KERNEL
select FUNCTION_TRACER
select STACKTRACE
+ select KALLSYMS
help
This special tracer records the maximum stack footprint of the
- kernel and displays it in debugfs/tracing/stack_trace.
+ kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
This tracer works by hooking into every function call that the
kernel executes, and keeping a maximum stack depth value and
@@ -259,16 +357,71 @@ config STACK_TRACER
config HW_BRANCH_TRACER
depends on HAVE_HW_BRANCH_TRACER
bool "Trace hw branches"
- select TRACING
+ select GENERIC_TRACER
help
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
+config KMEMTRACE
+ bool "Trace SLAB allocations"
+ select GENERIC_TRACER
+ help
+ kmemtrace provides tracing for slab allocator functions, such as
+ kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
+ data is then fed to the userspace application in order to analyse
+ allocation hotspots, internal fragmentation and so on, making it
+ possible to see how well an allocator performs, as well as debug
+ and profile kernel code.
+
+ This requires an userspace application to use. See
+ Documentation/trace/kmemtrace.txt for more information.
+
+ Saying Y will make the kernel somewhat larger and slower. However,
+ if you disable kmemtrace at run-time or boot-time, the performance
+ impact is minimal (depending on the arch the kernel is built for).
+
+ If unsure, say N.
+
+config WORKQUEUE_TRACER
+ bool "Trace workqueues"
+ select GENERIC_TRACER
+ help
+ The workqueue tracer provides some statistical informations
+ about each cpu workqueue thread such as the number of the
+ works inserted and executed since their creation. It can help
+ to evaluate the amount of work each of them have to perform.
+ For example it can help a developer to decide whether he should
+ choose a per cpu workqueue instead of a singlethreaded one.
+
+config BLK_DEV_IO_TRACE
+ bool "Support for tracing block io actions"
+ depends on SYSFS
+ depends on BLOCK
+ select RELAY
+ select DEBUG_FS
+ select TRACEPOINTS
+ select GENERIC_TRACER
+ select STACKTRACE
+ help
+ Say Y here if you want to be able to trace the block layer actions
+ on a given queue. Tracing allows you to see any traffic happening
+ on a block device queue. For more information (and the userspace
+ support tools needed), fetch the blktrace tools from:
+
+ git://git.kernel.dk/blktrace.git
+
+ Tracing also is possible using the ftrace interface, e.g.:
+
+ echo 1 > /sys/block/sda/sda1/trace/enable
+ echo blk > /sys/kernel/debug/tracing/current_tracer
+ cat /sys/kernel/debug/tracing/trace_pipe
+
+ If unsure, say N.
+
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
- depends on DEBUG_KERNEL
default y
help
This option will modify all the calls to ftrace dynamically
@@ -284,6 +437,20 @@ config DYNAMIC_FTRACE
were made. If so, it runs stop_machine (stops all CPUS)
and modifies the code to jump over the call to ftrace.
+config FUNCTION_PROFILER
+ bool "Kernel function profiler"
+ depends on FUNCTION_TRACER
+ default n
+ help
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A file in the trace_stats
+ directory called functions, that show the list of functions that
+ have been hit and their counters.
+
+ If in doubt, say N
+
config FTRACE_MCOUNT_RECORD
def_bool y
depends on DYNAMIC_FTRACE
@@ -294,7 +461,7 @@ config FTRACE_SELFTEST
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
- depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
+ depends on GENERIC_TRACER
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
@@ -302,4 +469,46 @@ config FTRACE_STARTUP_TEST
functioning properly. It will do tests on all the configured
tracers of ftrace.
-endmenu
+config MMIOTRACE
+ bool "Memory mapped IO tracing"
+ depends on HAVE_MMIOTRACE_SUPPORT && PCI
+ select GENERIC_TRACER
+ help
+ Mmiotrace traces Memory Mapped I/O access and is meant for
+ debugging and reverse engineering. It is called from the ioremap
+ implementation and works via page faults. Tracing is disabled by
+ default and can be enabled at run-time.
+
+ See Documentation/trace/mmiotrace.txt.
+ If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+ tristate "Test module for mmiotrace"
+ depends on MMIOTRACE && m
+ help
+ This is a dumb module for testing mmiotrace. It is very dangerous
+ as it will write garbage to IO memory starting at a given address.
+ However, it should be safe to use on e.g. unused portion of VRAM.
+
+ Say N, unless you absolutely know what you are doing.
+
+config RING_BUFFER_BENCHMARK
+ tristate "Ring buffer benchmark stress tester"
+ depends on RING_BUFFER
+ help
+ This option creates a test to stress the ring buffer and bench mark it.
+ It creates its own ring buffer such that it will not interfer with
+ any other users of the ring buffer (such as ftrace). It then creates
+ a producer and consumer that will run for 10 seconds and sleep for
+ 10 seconds. Each interval it will print out the number of events
+ it recorded and give a rough estimate of how long each iteration took.
+
+ It does not disable interrupts or raise its priority, so it may be
+ affected by processes that are running.
+
+ If unsure, say N
+
+endif # FTRACE
+
+endif # TRACING_SUPPORT
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653..844164dca90 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,10 +15,20 @@ ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_stat.o
+obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +43,16 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
+obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
+obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 00000000000..3eb159c277c
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1728 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/smp_lock.h>
+#include <linux/time.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
+#include "trace_output.h"
+
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
+static unsigned int blktrace_seq __read_mostly = 1;
+
+static struct trace_array *blk_tr;
+static bool blk_tracer_enabled __read_mostly;
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_BLK_OPT_CLASSIC 0x1
+
+static struct tracer_opt blk_tracer_opts[] = {
+ /* Default disable the minimalistic output */
+ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+ { }
+};
+
+static struct tracer_flags blk_tracer_flags = {
+ .val = 0,
+ .opts = blk_tracer_opts,
+};
+
+/* Global reference count of probes */
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static void blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
+/*
+ * Send out a notify message.
+ */
+static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+ const void *data, size_t len)
+{
+ struct blk_io_trace *t;
+ struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
+ int pc = 0;
+ int cpu = smp_processor_id();
+ bool blk_tracer = blk_tracer_enabled;
+
+ if (blk_tracer) {
+ buffer = blk_tr->buffer;
+ pc = preempt_count();
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
+ sizeof(*t) + len,
+ 0, pc);
+ if (!event)
+ return;
+ t = ring_buffer_event_data(event);
+ goto record_it;
+ }
+
+ if (!bt->rchan)
+ return;
+
+ t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ if (t) {
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->time = ktime_to_ns(ktime_get());
+record_it:
+ t->device = bt->dev;
+ t->action = action;
+ t->pid = pid;
+ t->cpu = cpu;
+ t->pdu_len = len;
+ memcpy((void *) t + sizeof(*t), data, len);
+
+ if (blk_tracer)
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+ }
+}
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+ tsk->btrace_seq = blktrace_seq;
+ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
+}
+
+static void trace_note_time(struct blk_trace *bt)
+{
+ struct timespec now;
+ unsigned long flags;
+ u32 words[2];
+
+ getnstimeofday(&now);
+ words[0] = now.tv_sec;
+ words[1] = now.tv_nsec;
+
+ local_irq_save(flags);
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ local_irq_restore(flags);
+}
+
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+ int n;
+ va_list args;
+ unsigned long flags;
+ char *buf;
+
+ if (unlikely(bt->trace_state != Blktrace_running &&
+ !blk_tracer_enabled))
+ return;
+
+ local_irq_save(flags);
+ buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+ va_start(args, fmt);
+ n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+ va_end(args);
+
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+ pid_t pid)
+{
+ if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+ return 1;
+ if (sector && (sector < bt->start_lba || sector > bt->end_lba))
+ return 1;
+ if (bt->pid && pid != bt->pid)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
+ BLK_TC_ACT(BLK_TC_WRITE) };
+
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
+ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+ int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+ struct task_struct *tsk = current;
+ struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
+ struct blk_io_trace *t;
+ unsigned long flags = 0;
+ unsigned long *sequence;
+ pid_t pid;
+ int cpu, pc = 0;
+ bool blk_tracer = blk_tracer_enabled;
+
+ if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
+ return;
+
+ what |= ddir_act[rw & WRITE];
+ what |= MASK_TC_BIT(rw, BARRIER);
+ what |= MASK_TC_BIT(rw, SYNCIO);
+ what |= MASK_TC_BIT(rw, AHEAD);
+ what |= MASK_TC_BIT(rw, META);
+ what |= MASK_TC_BIT(rw, DISCARD);
+
+ pid = tsk->pid;
+ if (act_log_check(bt, what, sector, pid))
+ return;
+ cpu = raw_smp_processor_id();
+
+ if (blk_tracer) {
+ tracing_record_cmdline(current);
+
+ buffer = blk_tr->buffer;
+ pc = preempt_count();
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
+ sizeof(*t) + pdu_len,
+ 0, pc);
+ if (!event)
+ return;
+ t = ring_buffer_event_data(event);
+ goto record_it;
+ }
+
+ /*
+ * A word about the locking here - we disable interrupts to reserve
+ * some space in the relay per-cpu buffer, to prevent an irq
+ * from coming in and stepping on our toes.
+ */
+ local_irq_save(flags);
+
+ if (unlikely(tsk->btrace_seq != blktrace_seq))
+ trace_note_tsk(bt, tsk);
+
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ if (t) {
+ sequence = per_cpu_ptr(bt->sequence, cpu);
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->sequence = ++(*sequence);
+ t->time = ktime_to_ns(ktime_get());
+record_it:
+ /*
+ * These two are not needed in ftrace as they are in the
+ * generic trace_entry, filled by tracing_generic_entry_update,
+ * but for the trace_event->bin() synthesizer benefit we do it
+ * here too.
+ */
+ t->cpu = cpu;
+ t->pid = pid;
+
+ t->sector = sector;
+ t->bytes = bytes;
+ t->action = what;
+ t->device = bt->dev;
+ t->error = error;
+ t->pdu_len = pdu_len;
+
+ if (pdu_len)
+ memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+
+ if (blk_tracer) {
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+ return;
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+static struct dentry *blk_tree_root;
+static DEFINE_MUTEX(blk_tree_mutex);
+
+static void blk_trace_free(struct blk_trace *bt)
+{
+ debugfs_remove(bt->msg_file);
+ debugfs_remove(bt->dropped_file);
+ relay_close(bt->rchan);
+ debugfs_remove(bt->dir);
+ free_percpu(bt->sequence);
+ free_percpu(bt->msg_data);
+ kfree(bt);
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+ blk_trace_free(bt);
+ if (atomic_dec_and_test(&blk_probes_ref))
+ blk_unregister_tracepoints();
+}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ struct blk_trace *bt;
+
+ bt = xchg(&q->blk_trace, NULL);
+ if (!bt)
+ return -EINVAL;
+
+ if (bt->trace_state != Blktrace_running)
+ blk_trace_cleanup(bt);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_remove);
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct blk_trace *bt = filp->private_data;
+ char buf[16];
+
+ snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+ return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static const struct file_operations blk_dropped_fops = {
+ .owner = THIS_MODULE,
+ .open = blk_dropped_open,
+ .read = blk_dropped_read,
+};
+
+static int blk_msg_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
+static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char *msg;
+ struct blk_trace *bt;
+
+ if (count >= BLK_TN_MAX_MSG)
+ return -EINVAL;
+
+ msg = kmalloc(count + 1, GFP_KERNEL);
+ if (msg == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(msg, buffer, count)) {
+ kfree(msg);
+ return -EFAULT;
+ }
+
+ msg[count] = '\0';
+ bt = filp->private_data;
+ __trace_note_message(bt, "%s", msg);
+ kfree(msg);
+
+ return count;
+}
+
+static const struct file_operations blk_msg_fops = {
+ .owner = THIS_MODULE,
+ .open = blk_msg_open,
+ .write = blk_msg_write,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+ void *prev_subbuf, size_t prev_padding)
+{
+ struct blk_trace *bt;
+
+ if (!relay_buf_full(buf))
+ return 1;
+
+ bt = buf->chan->private_data;
+ atomic_inc(&bt->dropped);
+ return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+ debugfs_remove(dentry);
+
+ return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+ struct dentry *parent,
+ int mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return debugfs_create_file(filename, mode, parent, buf,
+ &relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+ .subbuf_start = blk_subbuf_start_callback,
+ .create_buf_file = blk_create_buf_file_callback,
+ .remove_buf_file = blk_remove_buf_file_callback,
+};
+
+static void blk_trace_setup_lba(struct blk_trace *bt,
+ struct block_device *bdev)
+{
+ struct hd_struct *part = NULL;
+
+ if (bdev)
+ part = bdev->bd_part;
+
+ if (part) {
+ bt->start_lba = part->start_sect;
+ bt->end_lba = part->start_sect + part->nr_sects;
+ } else {
+ bt->start_lba = 0;
+ bt->end_lba = -1ULL;
+ }
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
+{
+ struct blk_trace *old_bt, *bt = NULL;
+ struct dentry *dir = NULL;
+ int ret, i;
+
+ if (!buts->buf_size || !buts->buf_nr)
+ return -EINVAL;
+
+ strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
+ buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+
+ /*
+ * some device names have larger paths - convert the slashes
+ * to underscores for this to work as expected
+ */
+ for (i = 0; i < strlen(buts->name); i++)
+ if (buts->name[i] == '/')
+ buts->name[i] = '_';
+
+ bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ if (!bt)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ bt->sequence = alloc_percpu(unsigned long);
+ if (!bt->sequence)
+ goto err;
+
+ bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+ if (!bt->msg_data)
+ goto err;
+
+ ret = -ENOENT;
+
+ mutex_lock(&blk_tree_mutex);
+ if (!blk_tree_root) {
+ blk_tree_root = debugfs_create_dir("block", NULL);
+ if (!blk_tree_root) {
+ mutex_unlock(&blk_tree_mutex);
+ goto err;
+ }
+ }
+ mutex_unlock(&blk_tree_mutex);
+
+ dir = debugfs_create_dir(buts->name, blk_tree_root);
+
+ if (!dir)
+ goto err;
+
+ bt->dir = dir;
+ bt->dev = dev;
+ atomic_set(&bt->dropped, 0);
+
+ ret = -EIO;
+ bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
+ &blk_dropped_fops);
+ if (!bt->dropped_file)
+ goto err;
+
+ bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ if (!bt->msg_file)
+ goto err;
+
+ bt->rchan = relay_open("trace", dir, buts->buf_size,
+ buts->buf_nr, &blk_relay_callbacks, bt);
+ if (!bt->rchan)
+ goto err;
+
+ bt->act_mask = buts->act_mask;
+ if (!bt->act_mask)
+ bt->act_mask = (u16) -1;
+
+ blk_trace_setup_lba(bt, bdev);
+
+ /* overwrite with user settings */
+ if (buts->start_lba)
+ bt->start_lba = buts->start_lba;
+ if (buts->end_lba)
+ bt->end_lba = buts->end_lba;
+
+ bt->pid = buts->pid;
+ bt->trace_state = Blktrace_setup;
+
+ ret = -EBUSY;
+ old_bt = xchg(&q->blk_trace, bt);
+ if (old_bt) {
+ (void) xchg(&q->blk_trace, old_bt);
+ goto err;
+ }
+
+ if (atomic_inc_return(&blk_probes_ref) == 1)
+ blk_register_tracepoints();
+
+ return 0;
+err:
+ blk_trace_free(bt);
+ return ret;
+}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ struct blk_user_trace_setup buts;
+ int ret;
+
+ ret = copy_from_user(&buts, arg, sizeof(buts));
+ if (ret)
+ return -EFAULT;
+
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, &buts, sizeof(buts)))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_setup);
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt == NULL)
+ return -EINVAL;
+
+ /*
+ * For starting a trace, we can transition from a setup or stopped
+ * trace. For stopping a trace, the state must be running
+ */
+ ret = -EINVAL;
+ if (start) {
+ if (bt->trace_state == Blktrace_setup ||
+ bt->trace_state == Blktrace_stopped) {
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+
+ trace_note_time(bt);
+ ret = 0;
+ }
+ } else {
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ relay_flush(bt->rchan);
+ ret = 0;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_trace_startstop);
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev: the block device
+ * @cmd: the ioctl cmd
+ * @arg: the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+ struct request_queue *q;
+ int ret, start = 0;
+ char b[BDEVNAME_SIZE];
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return -ENXIO;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ switch (cmd) {
+ case BLKTRACESETUP:
+ bdevname(bdev, b);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ break;
+ case BLKTRACESTART:
+ start = 1;
+ case BLKTRACESTOP:
+ ret = blk_trace_startstop(q, start);
+ break;
+ case BLKTRACETEARDOWN:
+ ret = blk_trace_remove(q);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q: the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(struct request_queue *q)
+{
+ if (q->blk_trace) {
+ blk_trace_startstop(q, 0);
+ blk_trace_remove(q);
+ }
+}
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q: queue the io is for
+ * @rq: the source request
+ * @what: the action
+ *
+ * Description:
+ * Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+ u32 what)
+{
+ struct blk_trace *bt = q->blk_trace;
+ int rw = rq->cmd_flags & 0x03;
+
+ if (likely(!bt))
+ return;
+
+ if (blk_discard_rq(rq))
+ rw |= (1 << BIO_RW_DISCARD);
+
+ if (blk_pc_request(rq)) {
+ what |= BLK_TC_ACT(BLK_TC_PC);
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+ what, rq->errors, rq->cmd_len, rq->cmd);
+ } else {
+ what |= BLK_TC_ACT(BLK_TC_FS);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
+ what, rq->errors, 0, NULL);
+ }
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q,
+ struct request *rq)
+{
+ blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q,
+ struct request *rq)
+{
+ blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @what: the action
+ *
+ * Description:
+ * Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+ u32 what)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+ !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q,
+ struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+ struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q,
+ struct bio *bio, int rw)
+{
+ if (bio)
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+ else {
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ }
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q,
+ struct bio *bio, int rw)
+{
+ if (bio)
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+ else {
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ 0, 0, NULL);
+ }
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt) {
+ unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+ __be64 rpdu = cpu_to_be64(pdu);
+
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+ sizeof(rpdu), &rpdu);
+ }
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt) {
+ unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+ __be64 rpdu = cpu_to_be64(pdu);
+
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+ sizeof(rpdu), &rpdu);
+ }
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+ unsigned int pdu)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt) {
+ __be64 rpdu = cpu_to_be64(pdu);
+
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+ BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+ sizeof(rpdu), &rpdu);
+ }
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @dev: target device
+ * @from: source sector
+ *
+ * Description:
+ * Device mapper or raid target sometimes need to split a bio because
+ * it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
+{
+ struct blk_trace *bt = q->blk_trace;
+ struct blk_io_trace_remap r;
+
+ if (likely(!bt))
+ return;
+
+ r.device_from = cpu_to_be32(dev);
+ r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.sector_from = cpu_to_be64(from);
+
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+ BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+ sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q: queue the io is for
+ * @rq: io request
+ * @data: driver-specific data
+ * @len: length of driver-specific data
+ *
+ * Description:
+ * Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+ struct request *rq,
+ void *data, size_t len)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ if (blk_pc_request(rq))
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
+ else
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static void blk_register_tracepoints(void)
+{
+ int ret;
+
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+ WARN_ON(ret);
+ ret = register_trace_block_getrq(blk_add_trace_getrq);
+ WARN_ON(ret);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+ WARN_ON(ret);
+ ret = register_trace_block_plug(blk_add_trace_plug);
+ WARN_ON(ret);
+ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ WARN_ON(ret);
+ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+ WARN_ON(ret);
+ ret = register_trace_block_split(blk_add_trace_split);
+ WARN_ON(ret);
+ ret = register_trace_block_remap(blk_add_trace_remap);
+ WARN_ON(ret);
+}
+
+static void blk_unregister_tracepoints(void)
+{
+ unregister_trace_block_remap(blk_add_trace_remap);
+ unregister_trace_block_split(blk_add_trace_split);
+ unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ unregister_trace_block_plug(blk_add_trace_plug);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+ unregister_trace_block_getrq(blk_add_trace_getrq);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+ tracepoint_synchronize_unregister();
+}
+
+/*
+ * struct blk_io_tracer formatting routines
+ */
+
+static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+{
+ int i = 0;
+ int tc = t->action >> BLK_TC_SHIFT;
+
+ if (t->action == BLK_TN_MESSAGE) {
+ rwbs[i++] = 'N';
+ goto out;
+ }
+
+ if (tc & BLK_TC_DISCARD)
+ rwbs[i++] = 'D';
+ else if (tc & BLK_TC_WRITE)
+ rwbs[i++] = 'W';
+ else if (t->bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (tc & BLK_TC_AHEAD)
+ rwbs[i++] = 'A';
+ if (tc & BLK_TC_BARRIER)
+ rwbs[i++] = 'B';
+ if (tc & BLK_TC_SYNC)
+ rwbs[i++] = 'S';
+ if (tc & BLK_TC_META)
+ rwbs[i++] = 'M';
+out:
+ rwbs[i] = '\0';
+}
+
+static inline
+const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+{
+ return (const struct blk_io_trace *)ent;
+}
+
+static inline const void *pdu_start(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent) + 1;
+}
+
+static inline u32 t_action(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->bytes;
+}
+
+static inline u32 t_sec(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->bytes >> 9;
+}
+
+static inline unsigned long long t_sector(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->sector;
+}
+
+static inline __u16 t_error(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->error;
+}
+
+static __u64 get_pdu_int(const struct trace_entry *ent)
+{
+ const __u64 *val = pdu_start(ent);
+ return be64_to_cpu(*val);
+}
+
+static void get_pdu_remap(const struct trace_entry *ent,
+ struct blk_io_trace_remap *r)
+{
+ const struct blk_io_trace_remap *__r = pdu_start(ent);
+ __u64 sector_from = __r->sector_from;
+
+ r->device_from = be32_to_cpu(__r->device_from);
+ r->device_to = be32_to_cpu(__r->device_to);
+ r->sector_from = be64_to_cpu(sector_from);
+}
+
+typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+
+static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
+{
+ char rwbs[6];
+ unsigned long long ts = iter->ts;
+ unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
+ unsigned secs = (unsigned long)ts;
+ const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
+ fill_rwbs(rwbs, t);
+
+ return trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
+}
+
+static int blk_log_action(struct trace_iterator *iter, const char *act)
+{
+ char rwbs[6];
+ const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
+ fill_rwbs(rwbs, t);
+ return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+ const unsigned char *pdu_buf;
+ int pdu_len;
+ int i, end, ret;
+
+ pdu_buf = pdu_start(ent);
+ pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+ if (!pdu_len)
+ return 1;
+
+ /* find the last zero that needs to be printed */
+ for (end = pdu_len - 1; end >= 0; end--)
+ if (pdu_buf[end])
+ break;
+ end++;
+
+ if (!trace_seq_putc(s, '('))
+ return 0;
+
+ for (i = 0; i < pdu_len; i++) {
+
+ ret = trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
+ if (!ret)
+ return ret;
+
+ /*
+ * stop when the rest is just zeroes and indicate so
+ * with a ".." appended
+ */
+ if (i == end && end != pdu_len - 1)
+ return trace_seq_puts(s, " ..) ");
+ }
+
+ return trace_seq_puts(s, ") ");
+}
+
+static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ int ret;
+
+ ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+ if (!ret)
+ return 0;
+ ret = blk_log_dump_pdu(s, ent);
+ if (!ret)
+ return 0;
+ return trace_seq_printf(s, "[%s]\n", cmd);
+ } else {
+ if (t_sec(ent))
+ return trace_seq_printf(s, "%llu + %u [%s]\n",
+ t_sector(ent), t_sec(ent), cmd);
+ return trace_seq_printf(s, "[%s]\n", cmd);
+ }
+}
+
+static int blk_log_with_error(struct trace_seq *s,
+ const struct trace_entry *ent)
+{
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ int ret;
+
+ ret = blk_log_dump_pdu(s, ent);
+ if (ret)
+ return trace_seq_printf(s, "[%d]\n", t_error(ent));
+ return 0;
+ } else {
+ if (t_sec(ent))
+ return trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ return trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
+ }
+}
+
+static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+{
+ struct blk_io_trace_remap r = { .device_from = 0, };
+
+ get_pdu_remap(ent, &r);
+ return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
+}
+
+static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ return trace_seq_printf(s, "[%s]\n", cmd);
+}
+
+static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+}
+
+static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
+}
+
+static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+{
+ int ret;
+ const struct blk_io_trace *t = te_blk_io_trace(ent);
+
+ ret = trace_seq_putmem(s, t + 1, t->pdu_len);
+ if (ret)
+ return trace_seq_putc(s, '\n');
+ return ret;
+}
+
+/*
+ * struct tracer operations
+ */
+
+static void blk_tracer_print_header(struct seq_file *m)
+{
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ return;
+ seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
+ "# | | | | | |\n");
+}
+
+static void blk_tracer_start(struct trace_array *tr)
+{
+ blk_tracer_enabled = true;
+}
+
+static int blk_tracer_init(struct trace_array *tr)
+{
+ blk_tr = tr;
+ blk_tracer_start(tr);
+ return 0;
+}
+
+static void blk_tracer_stop(struct trace_array *tr)
+{
+ blk_tracer_enabled = false;
+}
+
+static void blk_tracer_reset(struct trace_array *tr)
+{
+ blk_tracer_stop(tr);
+}
+
+static const struct {
+ const char *act[2];
+ int (*print)(struct trace_seq *s, const struct trace_entry *ent);
+} what2act[] = {
+ [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
+ [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
+ [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
+ [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
+ [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
+ [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
+ [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
+ [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
+ [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
+ [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
+ [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
+ [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
+ [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
+ [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
+ [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
+};
+
+static enum print_line_t print_one_line(struct trace_iterator *iter,
+ bool classic)
+{
+ struct trace_seq *s = &iter->seq;
+ const struct blk_io_trace *t;
+ u16 what;
+ int ret;
+ bool long_act;
+ blk_log_action_t *log_action;
+
+ t = te_blk_io_trace(iter->ent);
+ what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+ log_action = classic ? &blk_log_action_classic : &blk_log_action;
+
+ if (t->action == BLK_TN_MESSAGE) {
+ ret = log_action(iter, long_act ? "message" : "m");
+ if (ret)
+ ret = blk_log_msg(s, iter->ent);
+ goto out;
+ }
+
+ if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
+ ret = trace_seq_printf(s, "Unknown action %x\n", what);
+ else {
+ ret = log_action(iter, what2act[what].act[long_act]);
+ if (ret)
+ ret = what2act[what].print(s, iter->ent);
+ }
+out:
+ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+ int flags)
+{
+ return print_one_line(iter, false);
+}
+
+static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+ const int offset = offsetof(struct blk_io_trace, sector);
+ struct blk_io_trace old = {
+ .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
+ .time = iter->ts,
+ };
+
+ if (!trace_seq_putmem(s, &old, offset))
+ return 0;
+ return trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
+}
+
+static enum print_line_t
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+{
+ return blk_trace_synthesize_old_trace(iter) ?
+ TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
+{
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ return TRACE_TYPE_UNHANDLED;
+
+ return print_one_line(iter, true);
+}
+
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+ /* don't output context-info for blk_classic output */
+ if (bit == TRACE_BLK_OPT_CLASSIC) {
+ if (set)
+ trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ else
+ trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ }
+ return 0;
+}
+
+static struct tracer blk_tracer __read_mostly = {
+ .name = "blk",
+ .init = blk_tracer_init,
+ .reset = blk_tracer_reset,
+ .start = blk_tracer_start,
+ .stop = blk_tracer_stop,
+ .print_header = blk_tracer_print_header,
+ .print_line = blk_tracer_print_line,
+ .flags = &blk_tracer_flags,
+ .set_flag = blk_tracer_set_flag,
+};
+
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .trace = blk_trace_event_print,
+ .binary = blk_trace_event_print_binary,
+};
+
+static int __init init_blk_tracer(void)
+{
+ if (!register_ftrace_event(&trace_blk_event)) {
+ pr_warning("Warning: could not register block events\n");
+ return 1;
+ }
+
+ if (register_tracer(&blk_tracer) != 0) {
+ pr_warning("Warning: could not register the block tracer\n");
+ unregister_ftrace_event(&trace_blk_event);
+ return 1;
+ }
+
+ return 0;
+}
+
+device_initcall(init_blk_tracer);
+
+static int blk_trace_remove_queue(struct request_queue *q)
+{
+ struct blk_trace *bt;
+
+ bt = xchg(&q->blk_trace, NULL);
+ if (bt == NULL)
+ return -EINVAL;
+
+ if (atomic_dec_and_test(&blk_probes_ref))
+ blk_unregister_tracepoints();
+
+ blk_trace_free(bt);
+ return 0;
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup_queue(struct request_queue *q,
+ struct block_device *bdev)
+{
+ struct blk_trace *old_bt, *bt = NULL;
+ int ret = -ENOMEM;
+
+ bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ if (!bt)
+ return -ENOMEM;
+
+ bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+ if (!bt->msg_data)
+ goto free_bt;
+
+ bt->dev = bdev->bd_dev;
+ bt->act_mask = (u16)-1;
+
+ blk_trace_setup_lba(bt, bdev);
+
+ old_bt = xchg(&q->blk_trace, bt);
+ if (old_bt != NULL) {
+ (void)xchg(&q->blk_trace, old_bt);
+ ret = -EBUSY;
+ goto free_bt;
+ }
+
+ if (atomic_inc_return(&blk_probes_ref) == 1)
+ blk_register_tracepoints();
+ return 0;
+
+free_bt:
+ blk_trace_free(bt);
+ return ret;
+}
+
+/*
+ * sysfs interface to enable and configure tracing
+ */
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count);
+#define BLK_TRACE_DEVICE_ATTR(_name) \
+ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
+ sysfs_blk_trace_attr_show, \
+ sysfs_blk_trace_attr_store)
+
+static BLK_TRACE_DEVICE_ATTR(enable);
+static BLK_TRACE_DEVICE_ATTR(act_mask);
+static BLK_TRACE_DEVICE_ATTR(pid);
+static BLK_TRACE_DEVICE_ATTR(start_lba);
+static BLK_TRACE_DEVICE_ATTR(end_lba);
+
+static struct attribute *blk_trace_attrs[] = {
+ &dev_attr_enable.attr,
+ &dev_attr_act_mask.attr,
+ &dev_attr_pid.attr,
+ &dev_attr_start_lba.attr,
+ &dev_attr_end_lba.attr,
+ NULL
+};
+
+struct attribute_group blk_trace_attr_group = {
+ .name = "trace",
+ .attrs = blk_trace_attrs,
+};
+
+static const struct {
+ int mask;
+ const char *str;
+} mask_maps[] = {
+ { BLK_TC_READ, "read" },
+ { BLK_TC_WRITE, "write" },
+ { BLK_TC_BARRIER, "barrier" },
+ { BLK_TC_SYNC, "sync" },
+ { BLK_TC_QUEUE, "queue" },
+ { BLK_TC_REQUEUE, "requeue" },
+ { BLK_TC_ISSUE, "issue" },
+ { BLK_TC_COMPLETE, "complete" },
+ { BLK_TC_FS, "fs" },
+ { BLK_TC_PC, "pc" },
+ { BLK_TC_AHEAD, "ahead" },
+ { BLK_TC_META, "meta" },
+ { BLK_TC_DISCARD, "discard" },
+ { BLK_TC_DRV_DATA, "drv_data" },
+};
+
+static int blk_trace_str2mask(const char *str)
+{
+ int i;
+ int mask = 0;
+ char *buf, *s, *token;
+
+ buf = kstrdup(str, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+ s = strstrip(buf);
+
+ while (1) {
+ token = strsep(&s, ",");
+ if (token == NULL)
+ break;
+
+ if (*token == '\0')
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+ if (strcasecmp(token, mask_maps[i].str) == 0) {
+ mask |= mask_maps[i].mask;
+ break;
+ }
+ }
+ if (i == ARRAY_SIZE(mask_maps)) {
+ mask = -EINVAL;
+ break;
+ }
+ }
+ kfree(buf);
+
+ return mask;
+}
+
+static ssize_t blk_trace_mask2str(char *buf, int mask)
+{
+ int i;
+ char *p = buf;
+
+ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+ if (mask & mask_maps[i].mask) {
+ p += sprintf(p, "%s%s",
+ (p == buf) ? "" : ",", mask_maps[i].str);
+ }
+ }
+ *p++ = '\n';
+
+ return p - buf;
+}
+
+static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
+{
+ if (bdev->bd_disk == NULL)
+ return NULL;
+
+ return bdev_get_queue(bdev);
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ struct request_queue *q;
+ struct block_device *bdev;
+ ssize_t ret = -ENXIO;
+
+ lock_kernel();
+ bdev = bdget(part_devt(p));
+ if (bdev == NULL)
+ goto out_unlock_kernel;
+
+ q = blk_trace_get_queue(bdev);
+ if (q == NULL)
+ goto out_bdput;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (attr == &dev_attr_enable) {
+ ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ goto out_unlock_bdev;
+ }
+
+ if (q->blk_trace == NULL)
+ ret = sprintf(buf, "disabled\n");
+ else if (attr == &dev_attr_act_mask)
+ ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ else if (attr == &dev_attr_pid)
+ ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ else if (attr == &dev_attr_start_lba)
+ ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ else if (attr == &dev_attr_end_lba)
+ ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+
+out_unlock_bdev:
+ mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+ bdput(bdev);
+out_unlock_kernel:
+ unlock_kernel();
+ return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct block_device *bdev;
+ struct request_queue *q;
+ struct hd_struct *p;
+ u64 value;
+ ssize_t ret = -EINVAL;
+
+ if (count == 0)
+ goto out;
+
+ if (attr == &dev_attr_act_mask) {
+ if (sscanf(buf, "%llx", &value) != 1) {
+ /* Assume it is a list of trace category names */
+ ret = blk_trace_str2mask(buf);
+ if (ret < 0)
+ goto out;
+ value = ret;
+ }
+ } else if (sscanf(buf, "%llu", &value) != 1)
+ goto out;
+
+ ret = -ENXIO;
+
+ lock_kernel();
+ p = dev_to_part(dev);
+ bdev = bdget(part_devt(p));
+ if (bdev == NULL)
+ goto out_unlock_kernel;
+
+ q = blk_trace_get_queue(bdev);
+ if (q == NULL)
+ goto out_bdput;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (attr == &dev_attr_enable) {
+ if (value)
+ ret = blk_trace_setup_queue(q, bdev);
+ else
+ ret = blk_trace_remove_queue(q);
+ goto out_unlock_bdev;
+ }
+
+ ret = 0;
+ if (q->blk_trace == NULL)
+ ret = blk_trace_setup_queue(q, bdev);
+
+ if (ret == 0) {
+ if (attr == &dev_attr_act_mask)
+ q->blk_trace->act_mask = value;
+ else if (attr == &dev_attr_pid)
+ q->blk_trace->pid = value;
+ else if (attr == &dev_attr_start_lba)
+ q->blk_trace->start_lba = value;
+ else if (attr == &dev_attr_end_lba)
+ q->blk_trace->end_lba = value;
+ }
+
+out_unlock_bdev:
+ mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+ bdput(bdev);
+out_unlock_kernel:
+ unlock_kernel();
+out:
+ return ret ? ret : count;
+}
+
+int blk_trace_init_sysfs(struct device *dev)
+{
+ return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+ int i, end;
+ int len = rq->cmd_len;
+ unsigned char *cmd = rq->cmd;
+
+ if (!blk_pc_request(rq)) {
+ buf[0] = '\0';
+ return;
+ }
+
+ for (end = len - 1; end >= 0; end--)
+ if (cmd[end])
+ break;
+ end++;
+
+ for (i = 0; i < len; i++) {
+ buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+ if (i == end && end != len - 1) {
+ sprintf(buf, " ..");
+ break;
+ }
+ }
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+ int i = 0;
+
+ if (rw & WRITE)
+ rwbs[i++] = 'W';
+ else if (rw & 1 << BIO_RW_DISCARD)
+ rwbs[i++] = 'D';
+ else if (bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (rw & 1 << BIO_RW_AHEAD)
+ rwbs[i++] = 'A';
+ if (rw & 1 << BIO_RW_BARRIER)
+ rwbs[i++] = 'B';
+ if (rw & 1 << BIO_RW_SYNCIO)
+ rwbs[i++] = 'S';
+ if (rw & 1 << BIO_RW_META)
+ rwbs[i++] = 'M';
+
+ rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+ int rw = rq->cmd_flags & 0x03;
+ int bytes;
+
+ if (blk_discard_rq(rq))
+ rw |= (1 << BIO_RW_DISCARD);
+
+ bytes = blk_rq_bytes(rq);
+
+ blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09d..8c804e24f96 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
#include <linux/clocksource.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
@@ -26,10 +27,15 @@
#include <linux/sysctl.h>
#include <linux/ctype.h>
#include <linux/list.h>
+#include <linux/hash.h>
+
+#include <trace/events/sched.h>
#include <asm/ftrace.h>
+#include <asm/setup.h>
-#include "trace.h"
+#include "trace_output.h"
+#include "trace_stat.h"
#define FTRACE_WARN_ON(cond) \
do { \
@@ -43,14 +49,14 @@
ftrace_kill(); \
} while (0)
+/* hash bits for specific function selection */
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
/* Quick disabling of function tracer. */
int function_trace_stop;
@@ -60,13 +66,11 @@ int function_trace_stop;
*/
static int ftrace_disabled __read_mostly;
-static DEFINE_SPINLOCK(ftrace_lock);
-static DEFINE_MUTEX(ftrace_sysctl_lock);
-static DEFINE_MUTEX(ftrace_start_lock);
+static DEFINE_MUTEX(ftrace_lock);
static struct ftrace_ops ftrace_list_end __read_mostly =
{
- .func = ftrace_stub,
+ .func = ftrace_stub,
};
static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -133,9 +137,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
-
ops->next = ftrace_list;
/*
* We are entering ops into the ftrace_list but another
@@ -171,18 +172,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
#endif
}
- spin_unlock(&ftrace_lock);
-
return 0;
}
static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
struct ftrace_ops **p;
- int ret = 0;
-
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
/*
* If we are removing the last function, then simply point
@@ -191,17 +186,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_list == ops && ops->next == &ftrace_list_end) {
ftrace_trace_function = ftrace_stub;
ftrace_list = &ftrace_list_end;
- goto out;
+ return 0;
}
for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
if (*p == ops)
break;
- if (*p != ops) {
- ret = -1;
- goto out;
- }
+ if (*p != ops)
+ return -1;
*p = (*p)->next;
@@ -222,21 +215,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
}
}
- out:
- spin_unlock(&ftrace_lock);
-
- return ret;
+ return 0;
}
static void ftrace_update_pid_func(void)
{
ftrace_func_t func;
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
-
if (ftrace_trace_function == ftrace_stub)
- goto out;
+ return;
func = ftrace_trace_function;
@@ -253,23 +240,603 @@ static void ftrace_update_pid_func(void)
#else
__ftrace_trace_function = func;
#endif
+}
+
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+ struct hlist_node node;
+ unsigned long ip;
+ unsigned long counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ unsigned long long time;
+#endif
+};
+
+struct ftrace_profile_page {
+ struct ftrace_profile_page *next;
+ unsigned long index;
+ struct ftrace_profile records[];
+};
+
+struct ftrace_profile_stat {
+ atomic_t disabled;
+ struct hlist_head *hash;
+ struct ftrace_profile_page *pages;
+ struct ftrace_profile_page *start;
+ struct tracer_stat stat;
+};
+
+#define PROFILE_RECORDS_SIZE \
+ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE \
+ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static void *
+function_stat_next(void *v, int idx)
+{
+ struct ftrace_profile *rec = v;
+ struct ftrace_profile_page *pg;
+
+ pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+ if (idx != 0)
+ rec++;
+
+ if ((void *)rec >= (void *)&pg->records[pg->index]) {
+ pg = pg->next;
+ if (!pg)
+ return NULL;
+ rec = &pg->records[0];
+ if (!rec->counter)
+ goto again;
+ }
+
+ return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+ struct ftrace_profile_stat *stat =
+ container_of(trace, struct ftrace_profile_stat, stat);
+
+ if (!stat || !stat->start)
+ return NULL;
+
+ return function_stat_next(&stat->start->records[0], 0);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->time < b->time)
+ return -1;
+ if (a->time > b->time)
+ return 1;
+ else
+ return 0;
+}
+#else
+/* not function graph compares against hits */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->counter < b->counter)
+ return -1;
+ if (a->counter > b->counter)
+ return 1;
+ else
+ return 0;
+}
+#endif
+
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_printf(m, " Function "
+ "Hit Time Avg\n"
+ " -------- "
+ "--- ---- ---\n");
+#else
+ seq_printf(m, " Function Hit\n"
+ " -------- ---\n");
+#endif
+ return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_profile *rec = v;
+ char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static DEFINE_MUTEX(mutex);
+ static struct trace_seq s;
+ unsigned long long avg;
+#endif
+
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ seq_printf(m, " %-30.30s %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_printf(m, " ");
+ avg = rec->time;
+ do_div(avg, rec->counter);
+
+ mutex_lock(&mutex);
+ trace_seq_init(&s);
+ trace_print_graph_duration(rec->time, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(avg, &s);
+ trace_print_seq(m, &s);
+ mutex_unlock(&mutex);
+#endif
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+
+ pg = stat->pages = stat->start;
+
+ while (pg) {
+ memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+ pg->index = 0;
+ pg = pg->next;
+ }
+
+ memset(stat->hash, 0,
+ FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+ int functions;
+ int pages;
+ int i;
+
+ /* If we already allocated, do nothing */
+ if (stat->pages)
+ return 0;
+
+ stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!stat->pages)
+ return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ functions = ftrace_update_tot_cnt;
+#else
+ /*
+ * We do not know the number of functions that exist because
+ * dynamic tracing is what counts them. With past experience
+ * we have around 20K functions. That should be more than enough.
+ * It is highly unlikely we will execute every function in
+ * the kernel.
+ */
+ functions = 20000;
+#endif
+
+ pg = stat->start = stat->pages;
+
+ pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+ for (i = 0; i < pages; i++) {
+ pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!pg->next)
+ goto out_free;
+ pg = pg->next;
+ }
+
+ return 0;
+
+ out_free:
+ pg = stat->start;
+ while (pg) {
+ unsigned long tmp = (unsigned long)pg;
+
+ pg = pg->next;
+ free_page(tmp);
+ }
+
+ free_page((unsigned long)stat->pages);
+ stat->pages = NULL;
+ stat->start = NULL;
+
+ return -ENOMEM;
+}
+
+static int ftrace_profile_init_cpu(int cpu)
+{
+ struct ftrace_profile_stat *stat;
+ int size;
+
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ if (stat->hash) {
+ /* If the profile is already created, simply reset it */
+ ftrace_profile_reset(stat);
+ return 0;
+ }
+
+ /*
+ * We are profiling all functions, but usually only a few thousand
+ * functions are hit. We'll make a hash of 1024 items.
+ */
+ size = FTRACE_PROFILE_HASH_SIZE;
+
+ stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+ if (!stat->hash)
+ return -ENOMEM;
+
+ if (!ftrace_profile_bits) {
+ size--;
+
+ for (; size; size >>= 1)
+ ftrace_profile_bits++;
+ }
+
+ /* Preallocate the function profiling pages */
+ if (ftrace_profile_pages_init(stat) < 0) {
+ kfree(stat->hash);
+ stat->hash = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int ftrace_profile_init(void)
+{
+ int cpu;
+ int ret = 0;
+
+ for_each_online_cpu(cpu) {
+ ret = ftrace_profile_init_cpu(cpu);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/* interrupts must be disabled */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec;
+ struct hlist_head *hhd;
+ struct hlist_node *n;
+ unsigned long key;
+
+ key = hash_long(ip, ftrace_profile_bits);
+ hhd = &stat->hash[key];
+
+ if (hlist_empty(hhd))
+ return NULL;
+ hlist_for_each_entry_rcu(rec, n, hhd, node) {
+ if (rec->ip == ip)
+ return rec;
+ }
+
+ return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+ struct ftrace_profile *rec)
+{
+ unsigned long key;
+
+ key = hash_long(rec->ip, ftrace_profile_bits);
+ hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec = NULL;
+
+ /* prevent recursion (from NMIs) */
+ if (atomic_inc_return(&stat->disabled) != 1)
+ goto out;
+
+ /*
+ * Try to find the function again since an NMI
+ * could have added it
+ */
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (rec)
+ goto out;
+
+ if (stat->pages->index == PROFILES_PER_PAGE) {
+ if (!stat->pages->next)
+ goto out;
+ stat->pages = stat->pages->next;
+ }
+
+ rec = &stat->pages->records[stat->pages->index++];
+ rec->ip = ip;
+ ftrace_add_profile(stat, rec);
+
+ out:
+ atomic_dec(&stat->disabled);
+
+ return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct ftrace_profile_stat *stat;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ if (!ftrace_profile_enabled)
+ return;
+
+ local_irq_save(flags);
+
+ stat = &__get_cpu_var(ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (!rec) {
+ rec = ftrace_profile_alloc(stat, ip);
+ if (!rec)
+ goto out;
+ }
+
+ rec->counter++;
out:
- spin_unlock(&ftrace_lock);
+ local_irq_restore(flags);
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+ function_profile_call(trace->func, 0);
+ return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct ftrace_profile_stat *stat;
+ unsigned long long calltime;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ stat = &__get_cpu_var(ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ calltime = trace->rettime - trace->calltime;
+
+ if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+ int index;
+
+ index = trace->depth;
+
+ /* Append this call time to the parent time to subtract */
+ if (index)
+ current->ret_stack[index - 1].subtime += calltime;
+
+ if (current->ret_stack[index].subtime < calltime)
+ calltime -= current->ret_stack[index].subtime;
+ else
+ calltime = 0;
+ }
+
+ rec = ftrace_find_profiled_func(stat, trace->func);
+ if (rec)
+ rec->time += calltime;
+
+ out:
+ local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_graph(&profile_graph_return,
+ &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_graph();
+}
+#else
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+ .func = function_profile_call,
+};
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ char buf[64]; /* big enough to hold a number */
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ val = !!val;
+
+ mutex_lock(&ftrace_profile_lock);
+ if (ftrace_profile_enabled ^ val) {
+ if (val) {
+ ret = ftrace_profile_init();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+
+ ret = register_ftrace_profiler();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+ ftrace_profile_enabled = 1;
+ } else {
+ ftrace_profile_enabled = 0;
+ /*
+ * unregister_ftrace_profiler calls stop_machine
+ * so this acts like an synchronize_sched.
+ */
+ unregister_ftrace_profiler();
+ }
+ }
+ out:
+ mutex_unlock(&ftrace_profile_lock);
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64]; /* big enough to hold a number */
+ int r;
+
+ r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations ftrace_profile_fops = {
+ .open = tracing_open_generic,
+ .read = ftrace_profile_read,
+ .write = ftrace_profile_write,
+};
+
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+ .name = "functions",
+ .stat_start = function_stat_start,
+ .stat_next = function_stat_next,
+ .stat_cmp = function_stat_cmp,
+ .stat_headers = function_stat_headers,
+ .stat_show = function_stat_show
+};
+
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+ struct ftrace_profile_stat *stat;
+ struct dentry *entry;
+ char *name;
+ int ret;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ /* allocate enough for function name + cpu number */
+ name = kmalloc(32, GFP_KERNEL);
+ if (!name) {
+ /*
+ * The files created are permanent, if something happens
+ * we still do not free memory.
+ */
+ WARN(1,
+ "Could not allocate stat file for cpu %d\n",
+ cpu);
+ return;
+ }
+ stat->stat = function_stats;
+ snprintf(name, 32, "function%d", cpu);
+ stat->stat.name = name;
+ ret = register_stat_tracer(&stat->stat);
+ if (ret) {
+ WARN(1,
+ "Could not register function stat for cpu %d\n",
+ cpu);
+ kfree(name);
+ return;
+ }
+ }
+
+ entry = debugfs_create_file("function_profile_enabled", 0644,
+ d_tracer, NULL, &ftrace_profile_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
#ifdef CONFIG_DYNAMIC_FTRACE
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
-/*
- * Since MCOUNT_ADDR may point to mcount itself, we do not want
- * to get it confused by reading a reference in the code as we
- * are parsing on objcopy output of text. Use a variable for
- * it instead.
- */
-static unsigned long mcount_addr = MCOUNT_ADDR;
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_probe {
+ struct hlist_node node;
+ struct ftrace_probe_ops *ops;
+ unsigned long flags;
+ unsigned long ip;
+ void *data;
+ struct rcu_head rcu;
+};
enum {
FTRACE_ENABLE_CALLS = (1 << 0),
@@ -283,13 +850,13 @@ enum {
static int ftrace_filtered;
-static LIST_HEAD(ftrace_new_addrs);
+static struct dyn_ftrace *ftrace_new_addrs;
static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
struct ftrace_page *next;
- unsigned long index;
+ int index;
struct dyn_ftrace records[];
};
@@ -304,6 +871,19 @@ static struct ftrace_page *ftrace_pages;
static struct dyn_ftrace *ftrace_free_records;
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec) \
+ for (pg = ftrace_pages_start; pg; pg = pg->next) { \
+ int _____i; \
+ for (_____i = 0; _____i < pg->index; _____i++) { \
+ rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec() \
+ } \
+ }
#ifdef CONFIG_KPROBES
@@ -337,36 +917,11 @@ static inline int record_frozen(struct dyn_ftrace *rec)
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
- rec->ip = (unsigned long)ftrace_free_records;
+ rec->freelist = ftrace_free_records;
ftrace_free_records = rec;
rec->flags |= FTRACE_FL_FREE;
}
-void ftrace_release(void *start, unsigned long size)
-{
- struct dyn_ftrace *rec;
- struct ftrace_page *pg;
- unsigned long s = (unsigned long)start;
- unsigned long e = s + size;
- int i;
-
- if (ftrace_disabled || !start)
- return;
-
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
-
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- for (i = 0; i < pg->index; i++) {
- rec = &pg->records[i];
-
- if ((rec->ip >= s) && (rec->ip < e))
- ftrace_free_rec(rec);
- }
- }
- spin_unlock(&ftrace_lock);
-}
-
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
struct dyn_ftrace *rec;
@@ -381,7 +936,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
return NULL;
}
- ftrace_free_records = (void *)rec->ip;
+ ftrace_free_records = rec->freelist;
memset(rec, 0, sizeof(*rec));
return rec;
}
@@ -413,8 +968,8 @@ ftrace_record_ip(unsigned long ip)
return NULL;
rec->ip = ip;
-
- list_add(&rec->list, &ftrace_new_addrs);
+ rec->newlist = ftrace_new_addrs;
+ ftrace_new_addrs = rec;
return rec;
}
@@ -460,111 +1015,75 @@ static void ftrace_bug(int failed, unsigned long ip)
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
- unsigned long ip, fl;
unsigned long ftrace_addr;
+ unsigned long flag = 0UL;
- ftrace_addr = (unsigned long)ftrace_caller;
-
- ip = rec->ip;
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
/*
- * If this record is not to be traced and
- * it is not enabled then do nothing.
+ * If this record is not to be traced or we want to disable it,
+ * then disable it.
*
- * If this record is not to be traced and
- * it is enabled then disabled it.
+ * If we want to enable it and filtering is off, then enable it.
*
+ * If we want to enable it and filtering is on, enable it only if
+ * it's filtered
*/
- if (rec->flags & FTRACE_FL_NOTRACE) {
- if (rec->flags & FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- return 0;
-
- } else if (ftrace_filtered && enable) {
- /*
- * Filtering is on:
- */
-
- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
-
- /* Record is filtered and enabled, do nothing */
- if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
- return 0;
-
- /* Record is not filtered and is not enabled do nothing */
- if (!fl)
- return 0;
-
- /* Record is not filtered but enabled, disable it */
- if (fl == FTRACE_FL_ENABLED)
- rec->flags &= ~FTRACE_FL_ENABLED;
- else
- /* Otherwise record is filtered but not enabled, enable it */
- rec->flags |= FTRACE_FL_ENABLED;
- } else {
- /* Disable or not filtered */
-
- if (enable) {
- /* if record is enabled, do nothing */
- if (rec->flags & FTRACE_FL_ENABLED)
- return 0;
-
- rec->flags |= FTRACE_FL_ENABLED;
-
- } else {
+ if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
+ if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
+ flag = FTRACE_FL_ENABLED;
+ }
- /* if record is not enabled do nothing */
- if (!(rec->flags & FTRACE_FL_ENABLED))
- return 0;
+ /* If the state of this record hasn't changed, then do nothing */
+ if ((rec->flags & FTRACE_FL_ENABLED) == flag)
+ return 0;
- rec->flags &= ~FTRACE_FL_ENABLED;
- }
+ if (flag) {
+ rec->flags |= FTRACE_FL_ENABLED;
+ return ftrace_make_call(rec, ftrace_addr);
}
- if (rec->flags & FTRACE_FL_ENABLED)
- return ftrace_make_call(rec, ftrace_addr);
- else
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
static void ftrace_replace_code(int enable)
{
- int i, failed;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ int failed;
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- for (i = 0; i < pg->index; i++) {
- rec = &pg->records[i];
-
- /*
- * Skip over free records and records that have
- * failed.
- */
- if (rec->flags & FTRACE_FL_FREE ||
- rec->flags & FTRACE_FL_FAILED)
- continue;
+ do_for_each_ftrace_rec(pg, rec) {
+ /*
+ * Skip over free records, records that have
+ * failed and not converted.
+ */
+ if (rec->flags & FTRACE_FL_FREE ||
+ rec->flags & FTRACE_FL_FAILED ||
+ !(rec->flags & FTRACE_FL_CONVERTED))
+ continue;
- /* ignore updates to this record's mcount site */
- if (get_kprobe((void *)rec->ip)) {
- freeze_record(rec);
- continue;
- } else {
- unfreeze_record(rec);
- }
+ /* ignore updates to this record's mcount site */
+ if (get_kprobe((void *)rec->ip)) {
+ freeze_record(rec);
+ continue;
+ } else {
+ unfreeze_record(rec);
+ }
- failed = __ftrace_replace_code(rec, enable);
- if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
- rec->flags |= FTRACE_FL_FAILED;
- if ((system_state == SYSTEM_BOOTING) ||
- !core_kernel_text(rec->ip)) {
- ftrace_free_rec(rec);
- } else
- ftrace_bug(failed, rec->ip);
- }
+ failed = __ftrace_replace_code(rec, enable);
+ if (failed) {
+ rec->flags |= FTRACE_FL_FAILED;
+ if ((system_state == SYSTEM_BOOTING) ||
+ !core_kernel_text(rec->ip)) {
+ ftrace_free_rec(rec);
+ } else {
+ ftrace_bug(failed, rec->ip);
+ /* Stop processing */
+ return;
+ }
}
- }
+ } while_for_each_ftrace_rec();
}
static int
@@ -575,7 +1094,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ip = rec->ip;
- ret = ftrace_make_nop(mod, rec, mcount_addr);
+ ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
ftrace_bug(ret, ip);
rec->flags |= FTRACE_FL_FAILED;
@@ -584,6 +1103,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
return 1;
}
+/*
+ * archs can override this function if they must do something
+ * before the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_prepare(void)
+{
+ return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * after the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_post_process(void)
+{
+ return 0;
+}
+
static int __ftrace_modify_code(void *data)
{
int *command = data;
@@ -606,7 +1143,17 @@ static int __ftrace_modify_code(void *data)
static void ftrace_run_update_code(int command)
{
+ int ret;
+
+ ret = ftrace_arch_code_modify_prepare();
+ FTRACE_WARN_ON(ret);
+ if (ret)
+ return;
+
stop_machine(__ftrace_modify_code, &command, NULL);
+
+ ret = ftrace_arch_code_modify_post_process();
+ FTRACE_WARN_ON(ret);
}
static ftrace_func_t saved_ftrace_func;
@@ -630,13 +1177,10 @@ static void ftrace_startup(int command)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftrace_start_lock);
ftrace_start_up++;
command |= FTRACE_ENABLE_CALLS;
ftrace_startup_enable(command);
-
- mutex_unlock(&ftrace_start_lock);
}
static void ftrace_shutdown(int command)
@@ -644,8 +1188,14 @@ static void ftrace_shutdown(int command)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftrace_start_lock);
ftrace_start_up--;
+ /*
+ * Just warn in case of unbalance, no need to kill ftrace, it's not
+ * critical but the ftrace_call callers may be never nopped again after
+ * further ftrace uses.
+ */
+ WARN_ON_ONCE(ftrace_start_up < 0);
+
if (!ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
@@ -655,11 +1205,9 @@ static void ftrace_shutdown(int command)
}
if (!command || !ftrace_enabled)
- goto out;
+ return;
ftrace_run_update_code(command);
- out:
- mutex_unlock(&ftrace_start_lock);
}
static void ftrace_startup_sysctl(void)
@@ -669,7 +1217,6 @@ static void ftrace_startup_sysctl(void)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftrace_start_lock);
/* Force update next time */
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
@@ -677,7 +1224,6 @@ static void ftrace_startup_sysctl(void)
command |= FTRACE_ENABLE_CALLS;
ftrace_run_update_code(command);
- mutex_unlock(&ftrace_start_lock);
}
static void ftrace_shutdown_sysctl(void)
@@ -687,13 +1233,11 @@ static void ftrace_shutdown_sysctl(void)
if (unlikely(ftrace_disabled))
return;
- mutex_lock(&ftrace_start_lock);
/* ftrace_start_up is true if ftrace is running */
if (ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
ftrace_run_update_code(command);
- mutex_unlock(&ftrace_start_lock);
}
static cycle_t ftrace_update_time;
@@ -702,19 +1246,21 @@ unsigned long ftrace_update_tot_cnt;
static int ftrace_update_code(struct module *mod)
{
- struct dyn_ftrace *p, *t;
+ struct dyn_ftrace *p;
cycle_t start, stop;
start = ftrace_now(raw_smp_processor_id());
ftrace_update_cnt = 0;
- list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
+ while (ftrace_new_addrs) {
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
- list_del_init(&p->list);
+ p = ftrace_new_addrs;
+ ftrace_new_addrs = p->newlist;
+ p->flags = 0L;
/* convert record (i.e, patch mcount-call with NOP) */
if (ftrace_code_disable(mod, p)) {
@@ -780,37 +1326,116 @@ enum {
FTRACE_ITER_CONT = (1 << 1),
FTRACE_ITER_NOTRACE = (1 << 2),
FTRACE_ITER_FAILURES = (1 << 3),
+ FTRACE_ITER_PRINTALL = (1 << 4),
+ FTRACE_ITER_HASH = (1 << 5),
};
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
struct ftrace_iterator {
struct ftrace_page *pg;
- unsigned idx;
+ int hidx;
+ int idx;
unsigned flags;
unsigned char buffer[FTRACE_BUFF_MAX+1];
unsigned buffer_idx;
- unsigned filtered;
};
static void *
+t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct hlist_node *hnd = v;
+ struct hlist_head *hhd;
+
+ WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
+
+ (*pos)++;
+
+ retry:
+ if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
+ return NULL;
+
+ hhd = &ftrace_func_hash[iter->hidx];
+
+ if (hlist_empty(hhd)) {
+ iter->hidx++;
+ hnd = NULL;
+ goto retry;
+ }
+
+ if (!hnd)
+ hnd = hhd->first;
+ else {
+ hnd = hnd->next;
+ if (!hnd) {
+ iter->hidx++;
+ goto retry;
+ }
+ }
+
+ return hnd;
+}
+
+static void *t_hash_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l;
+
+ if (!(iter->flags & FTRACE_ITER_HASH))
+ *pos = 0;
+
+ iter->flags |= FTRACE_ITER_HASH;
+
+ iter->hidx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_hash_next(m, p, &l);
+ if (!p)
+ break;
+ }
+ return p;
+}
+
+static int t_hash_show(struct seq_file *m, void *v)
+{
+ struct ftrace_func_probe *rec;
+ struct hlist_node *hnd = v;
+
+ rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+ if (rec->ops->print)
+ return rec->ops->print(m, rec->ip, rec->ops, rec->data);
+
+ seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
+
+ if (rec->data)
+ seq_printf(m, ":%p", rec->data);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
struct dyn_ftrace *rec = NULL;
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_next(m, v, pos);
+
(*pos)++;
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
+ if (iter->flags & FTRACE_ITER_PRINTALL)
+ return NULL;
+
retry:
if (iter->idx >= iter->pg->index) {
if (iter->pg->next) {
iter->pg = iter->pg->next;
iter->idx = 0;
goto retry;
- } else {
- iter->idx = -1;
}
} else {
rec = &iter->pg->records[iter->idx++];
@@ -831,7 +1456,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
goto retry;
}
}
- spin_unlock(&ftrace_lock);
return rec;
}
@@ -840,34 +1464,60 @@ static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
void *p = NULL;
+ loff_t l;
+
+ mutex_lock(&ftrace_lock);
+ /*
+ * For set_ftrace_filter reading, if we have the filter
+ * off, we can short cut and just print out that all
+ * functions are enabled.
+ */
+ if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
+ if (*pos > 0)
+ return t_hash_start(m, pos);
+ iter->flags |= FTRACE_ITER_PRINTALL;
+ return iter;
+ }
+
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_start(m, pos);
- if (*pos > 0) {
- if (iter->idx < 0)
- return p;
- (*pos)--;
- iter->idx--;
+ iter->pg = ftrace_pages_start;
+ iter->idx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_next(m, p, &l);
+ if (!p)
+ break;
}
- p = t_next(m, p, pos);
+ if (!p && iter->flags & FTRACE_ITER_FILTER)
+ return t_hash_start(m, pos);
return p;
}
static void t_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&ftrace_lock);
}
static int t_show(struct seq_file *m, void *v)
{
+ struct ftrace_iterator *iter = m->private;
struct dyn_ftrace *rec = v;
- char str[KSYM_SYMBOL_LEN];
- if (!rec)
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_show(m, v);
+
+ if (iter->flags & FTRACE_ITER_PRINTALL) {
+ seq_printf(m, "#### all functions enabled ####\n");
return 0;
+ }
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ if (!rec)
+ return 0;
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pf\n", (void *)rec->ip);
return 0;
}
@@ -906,17 +1556,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return ret;
}
-int ftrace_avail_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- struct ftrace_iterator *iter = m->private;
-
- seq_release(inode, file);
- kfree(iter);
-
- return 0;
-}
-
static int
ftrace_failures_open(struct inode *inode, struct file *file)
{
@@ -940,23 +1579,16 @@ static void ftrace_filter_reset(int enable)
struct ftrace_page *pg;
struct dyn_ftrace *rec;
unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
- unsigned i;
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
+ mutex_lock(&ftrace_lock);
if (enable)
ftrace_filtered = 0;
- pg = ftrace_pages_start;
- while (pg) {
- for (i = 0; i < pg->index; i++) {
- rec = &pg->records[i];
- if (rec->flags & FTRACE_FL_FAILED)
- continue;
- rec->flags &= ~type;
- }
- pg = pg->next;
- }
- spin_unlock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+ rec->flags &= ~type;
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
}
static int
@@ -974,7 +1606,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
mutex_lock(&ftrace_regex_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND))
+ (file->f_flags & O_TRUNC))
ftrace_filter_reset(enable);
if (file->f_mode & FMODE_READ) {
@@ -1007,16 +1639,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
return ftrace_regex_open(inode, file, 0);
}
-static ssize_t
-ftrace_regex_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- if (file->f_mode & FMODE_READ)
- return seq_read(file, ubuf, cnt, ppos);
- else
- return -EPERM;
-}
-
static loff_t
ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
{
@@ -1037,86 +1659,536 @@ enum {
MATCH_END_ONLY,
};
-static void
-ftrace_match(unsigned char *buff, int len, int enable)
+/*
+ * (static function - no need for kernel doc)
+ *
+ * Pass in a buffer containing a glob and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ * search returns the pointer to use for comparison.
+ * not returns 1 if buff started with a '!'
+ * 0 otherwise.
+ */
+static int
+ftrace_setup_glob(char *buff, int len, char **search, int *not)
{
- char str[KSYM_SYMBOL_LEN];
- char *search = NULL;
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
int type = MATCH_FULL;
- unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
- unsigned i, match = 0, search_len = 0;
- int not = 0;
+ int i;
if (buff[0] == '!') {
- not = 1;
+ *not = 1;
buff++;
len--;
- }
+ } else
+ *not = 0;
+
+ *search = buff;
for (i = 0; i < len; i++) {
if (buff[i] == '*') {
if (!i) {
- search = buff + i + 1;
+ *search = buff + 1;
type = MATCH_END_ONLY;
- search_len = len - (i + 1);
} else {
- if (type == MATCH_END_ONLY) {
+ if (type == MATCH_END_ONLY)
type = MATCH_MIDDLE_ONLY;
- } else {
- match = i;
+ else
type = MATCH_FRONT_ONLY;
- }
buff[i] = 0;
break;
}
}
}
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
- if (enable)
- ftrace_filtered = 1;
- pg = ftrace_pages_start;
- while (pg) {
- for (i = 0; i < pg->index; i++) {
- int matched = 0;
- char *ptr;
+ return type;
+}
+
+static int ftrace_match(char *str, char *regex, int len, int type)
+{
+ int matched = 0;
+ char *ptr;
+
+ switch (type) {
+ case MATCH_FULL:
+ if (strcmp(str, regex) == 0)
+ matched = 1;
+ break;
+ case MATCH_FRONT_ONLY:
+ if (strncmp(str, regex, len) == 0)
+ matched = 1;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ if (strstr(str, regex))
+ matched = 1;
+ break;
+ case MATCH_END_ONLY:
+ ptr = strstr(str, regex);
+ if (ptr && (ptr[len] == 0))
+ matched = 1;
+ break;
+ }
+
+ return matched;
+}
+
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+{
+ char str[KSYM_SYMBOL_LEN];
+
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ return ftrace_match(str, regex, len, type);
+}
+
+static void ftrace_match_records(char *buff, int len, int enable)
+{
+ unsigned int search_len;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ unsigned long flag;
+ char *search;
+ int type;
+ int not;
+
+ flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+ type = ftrace_setup_glob(buff, len, &search, &not);
+
+ search_len = strlen(search);
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+
+ if (ftrace_match_record(rec, search, search_len, type)) {
+ if (not)
+ rec->flags &= ~flag;
+ else
+ rec->flags |= flag;
+ }
+ /*
+ * Only enable filtering if we have a function that
+ * is filtered on.
+ */
+ if (enable && (rec->flags & FTRACE_FL_FILTER))
+ ftrace_filtered = 1;
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static int
+ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
+ char *regex, int len, int type)
+{
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+
+ kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+
+ if (!modname || strcmp(modname, mod))
+ return 0;
+
+ /* blank search means to match all funcs in the mod */
+ if (len)
+ return ftrace_match(str, regex, len, type);
+ else
+ return 1;
+}
+
+static void ftrace_match_module_records(char *buff, char *mod, int enable)
+{
+ unsigned search_len = 0;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int type = MATCH_FULL;
+ char *search = buff;
+ unsigned long flag;
+ int not = 0;
+
+ flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+
+ /* blank or '*' mean the same */
+ if (strcmp(buff, "*") == 0)
+ buff[0] = 0;
- rec = &pg->records[i];
- if (rec->flags & FTRACE_FL_FAILED)
+ /* handle the case of 'dont filter this module' */
+ if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
+ buff[0] = 0;
+ not = 1;
+ }
+
+ if (strlen(buff)) {
+ type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+ search_len = strlen(search);
+ }
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+
+ if (ftrace_match_module_record(rec, mod,
+ search, search_len, type)) {
+ if (not)
+ rec->flags &= ~flag;
+ else
+ rec->flags |= flag;
+ }
+ if (enable && (rec->flags & FTRACE_FL_FILTER))
+ ftrace_filtered = 1;
+
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+/*
+ * We register the module command as a template to show others how
+ * to register the a command as well.
+ */
+
+static int
+ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
+{
+ char *mod;
+
+ /*
+ * cmd == 'mod' because we only registered this func
+ * for the 'mod' ftrace_func_command.
+ * But if you register one func with multiple commands,
+ * you can tell which command was used by the cmd
+ * parameter.
+ */
+
+ /* we must have a module name */
+ if (!param)
+ return -EINVAL;
+
+ mod = strsep(&param, ":");
+ if (!strlen(mod))
+ return -EINVAL;
+
+ ftrace_match_module_records(func, mod, enable);
+ return 0;
+}
+
+static struct ftrace_func_command ftrace_mod_cmd = {
+ .name = "mod",
+ .func = ftrace_mod_callback,
+};
+
+static int __init ftrace_mod_cmd_init(void)
+{
+ return register_ftrace_command(&ftrace_mod_cmd);
+}
+device_initcall(ftrace_mod_cmd_init);
+
+static void
+function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct ftrace_func_probe *entry;
+ struct hlist_head *hhd;
+ struct hlist_node *n;
+ unsigned long key;
+ int resched;
+
+ key = hash_long(ip, FTRACE_HASH_BITS);
+
+ hhd = &ftrace_func_hash[key];
+
+ if (hlist_empty(hhd))
+ return;
+
+ /*
+ * Disable preemption for these calls to prevent a RCU grace
+ * period. This syncs the hash iteration and freeing of items
+ * on the hash. rcu_read_lock is too dangerous here.
+ */
+ resched = ftrace_preempt_disable();
+ hlist_for_each_entry_rcu(entry, n, hhd, node) {
+ if (entry->ip == ip)
+ entry->ops->func(ip, parent_ip, &entry->data);
+ }
+ ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_probe_ops __read_mostly =
+{
+ .func = function_trace_probe_call,
+};
+
+static int ftrace_probe_registered;
+
+static void __enable_ftrace_function_probe(void)
+{
+ int i;
+
+ if (ftrace_probe_registered)
+ return;
+
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+ if (hhd->first)
+ break;
+ }
+ /* Nothing registered? */
+ if (i == FTRACE_FUNC_HASHSIZE)
+ return;
+
+ __register_ftrace_function(&trace_probe_ops);
+ ftrace_startup(0);
+ ftrace_probe_registered = 1;
+}
+
+static void __disable_ftrace_function_probe(void)
+{
+ int i;
+
+ if (!ftrace_probe_registered)
+ return;
+
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+ if (hhd->first)
+ return;
+ }
+
+ /* no more funcs left */
+ __unregister_ftrace_function(&trace_probe_ops);
+ ftrace_shutdown(0);
+ ftrace_probe_registered = 0;
+}
+
+
+static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+{
+ struct ftrace_func_probe *entry =
+ container_of(rhp, struct ftrace_func_probe, rcu);
+
+ if (entry->ops->free)
+ entry->ops->free(&entry->data);
+ kfree(entry);
+}
+
+
+int
+register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ struct ftrace_func_probe *entry;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int type, len, not;
+ unsigned long key;
+ int count = 0;
+ char *search;
+
+ type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ len = strlen(search);
+
+ /* we do not support '!' for function probes */
+ if (WARN_ON(not))
+ return -EINVAL;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (rec->flags & FTRACE_FL_FAILED)
+ continue;
+
+ if (!ftrace_match_record(rec, search, len, type))
+ continue;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ /* If we did not process any, then return error */
+ if (!count)
+ count = -ENOMEM;
+ goto out_unlock;
+ }
+
+ count++;
+
+ entry->data = data;
+
+ /*
+ * The caller might want to do something special
+ * for each function we find. We call the callback
+ * to give the caller an opportunity to do so.
+ */
+ if (ops->callback) {
+ if (ops->callback(rec->ip, &entry->data) < 0) {
+ /* caller does not like this func */
+ kfree(entry);
continue;
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
- switch (type) {
- case MATCH_FULL:
- if (strcmp(str, buff) == 0)
- matched = 1;
- break;
- case MATCH_FRONT_ONLY:
- if (memcmp(str, buff, match) == 0)
- matched = 1;
- break;
- case MATCH_MIDDLE_ONLY:
- if (strstr(str, search))
- matched = 1;
- break;
- case MATCH_END_ONLY:
- ptr = strstr(str, search);
- if (ptr && (ptr[search_len] == 0))
- matched = 1;
- break;
}
- if (matched) {
- if (not)
- rec->flags &= ~flag;
- else
- rec->flags |= flag;
+ }
+
+ entry->ops = ops;
+ entry->ip = rec->ip;
+
+ key = hash_long(entry->ip, FTRACE_HASH_BITS);
+ hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
+
+ } while_for_each_ftrace_rec();
+ __enable_ftrace_function_probe();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
+
+ return count;
+}
+
+enum {
+ PROBE_TEST_FUNC = 1,
+ PROBE_TEST_DATA = 2
+};
+
+static void
+__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data, int flags)
+{
+ struct ftrace_func_probe *entry;
+ struct hlist_node *n, *tmp;
+ char str[KSYM_SYMBOL_LEN];
+ int type = MATCH_FULL;
+ int i, len = 0;
+ char *search;
+
+ if (glob && (strcmp(glob, "*") || !strlen(glob)))
+ glob = NULL;
+ else {
+ int not;
+
+ type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+ len = strlen(search);
+
+ /* we do not support '!' for function probes */
+ if (WARN_ON(not))
+ return;
+ }
+
+ mutex_lock(&ftrace_lock);
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+
+ hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+
+ /* break up if statements for readability */
+ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
+ continue;
+
+ if ((flags & PROBE_TEST_DATA) && entry->data != data)
+ continue;
+
+ /* do this last, since it is the most expensive */
+ if (glob) {
+ kallsyms_lookup(entry->ip, NULL, NULL,
+ NULL, str);
+ if (!ftrace_match(str, glob, len, type))
+ continue;
}
+
+ hlist_del(&entry->node);
+ call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+ }
+ }
+ __disable_ftrace_function_probe();
+ mutex_unlock(&ftrace_lock);
+}
+
+void
+unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ __unregister_ftrace_function_probe(glob, ops, data,
+ PROBE_TEST_FUNC | PROBE_TEST_DATA);
+}
+
+void
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
+{
+ __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
+}
+
+void unregister_ftrace_function_probe_all(char *glob)
+{
+ __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
+}
+
+static LIST_HEAD(ftrace_commands);
+static DEFINE_MUTEX(ftrace_cmd_mutex);
+
+int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+ struct ftrace_func_command *p;
+ int ret = 0;
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry(p, &ftrace_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ list_add(&cmd->list, &ftrace_commands);
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
+}
+
+int unregister_ftrace_command(struct ftrace_func_command *cmd)
+{
+ struct ftrace_func_command *p, *n;
+ int ret = -ENODEV;
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry_safe(p, n, &ftrace_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = 0;
+ list_del_init(&p->list);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
+}
+
+static int ftrace_process_regex(char *buff, int len, int enable)
+{
+ char *func, *command, *next = buff;
+ struct ftrace_func_command *p;
+ int ret = -EINVAL;
+
+ func = strsep(&next, ":");
+
+ if (!next) {
+ ftrace_match_records(func, len, enable);
+ return 0;
+ }
+
+ /* command found */
+
+ command = strsep(&next, ":");
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry(p, &ftrace_commands, list) {
+ if (strcmp(p->name, command) == 0) {
+ ret = p->func(func, command, next, enable);
+ goto out_unlock;
}
- pg = pg->next;
}
- spin_unlock(&ftrace_lock);
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
}
static ssize_t
@@ -1150,7 +2222,11 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
read++;
cnt--;
- if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+ /*
+ * If the parser haven't finished with the last write,
+ * continue reading the user input without skipping spaces.
+ */
+ if (!(iter->flags & FTRACE_ITER_CONT)) {
/* skip white space */
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
@@ -1160,8 +2236,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
cnt--;
}
+ /* only spaces were written */
if (isspace(ch)) {
- file->f_pos += read;
+ *ppos += read;
ret = read;
goto out;
}
@@ -1184,16 +2261,18 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
}
if (isspace(ch)) {
- iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
- ftrace_match(iter->buffer, iter->buffer_idx, enable);
+ ret = ftrace_process_regex(iter->buffer,
+ iter->buffer_idx, enable);
+ if (ret)
+ goto out;
iter->buffer_idx = 0;
- } else
+ } else {
iter->flags |= FTRACE_ITER_CONT;
+ iter->buffer[iter->buffer_idx++] = ch;
+ }
-
- file->f_pos += read;
-
+ *ppos += read;
ret = read;
out:
mutex_unlock(&ftrace_regex_lock);
@@ -1225,7 +2304,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
if (reset)
ftrace_filter_reset(enable);
if (buf)
- ftrace_match(buf, len, enable);
+ ftrace_match_records(buf, len, enable);
mutex_unlock(&ftrace_regex_lock);
}
@@ -1258,6 +2337,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
ftrace_set_regex(buf, len, reset, 0);
}
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+ strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+ strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+ char *func;
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ ftrace_set_regex(func, strlen(func), 0, enable);
+ }
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+ if (ftrace_filter_buf[0])
+ set_ftrace_early_filter(ftrace_filter_buf, 1);
+ if (ftrace_notrace_buf[0])
+ set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
static int
ftrace_regex_release(struct inode *inode, struct file *file, int enable)
{
@@ -1273,17 +2391,14 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
iter = file->private_data;
if (iter->buffer_idx) {
- iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
- ftrace_match(iter->buffer, iter->buffer_idx, enable);
+ ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
}
- mutex_lock(&ftrace_sysctl_lock);
- mutex_lock(&ftrace_start_lock);
+ mutex_lock(&ftrace_lock);
if (ftrace_start_up && ftrace_enabled)
ftrace_run_update_code(FTRACE_ENABLE_CALLS);
- mutex_unlock(&ftrace_start_lock);
- mutex_unlock(&ftrace_sysctl_lock);
+ mutex_unlock(&ftrace_lock);
kfree(iter);
mutex_unlock(&ftrace_regex_lock);
@@ -1302,31 +2417,31 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
return ftrace_regex_release(inode, file, 0);
}
-static struct file_operations ftrace_avail_fops = {
+static const struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
-static struct file_operations ftrace_failures_fops = {
+static const struct file_operations ftrace_failures_fops = {
.open = ftrace_failures_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = ftrace_avail_release,
+ .release = seq_release_private,
};
-static struct file_operations ftrace_filter_fops = {
+static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
- .read = ftrace_regex_read,
+ .read = seq_read,
.write = ftrace_filter_write,
.llseek = ftrace_regex_lseek,
.release = ftrace_filter_release,
};
-static struct file_operations ftrace_notrace_fops = {
+static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
- .read = ftrace_regex_read,
+ .read = seq_read,
.write = ftrace_notrace_write,
.llseek = ftrace_regex_lseek,
.release = ftrace_notrace_release,
@@ -1340,28 +2455,31 @@ int ftrace_graph_count;
unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
static void *
-g_next(struct seq_file *m, void *v, loff_t *pos)
+__g_next(struct seq_file *m, loff_t *pos)
{
unsigned long *array = m->private;
- int index = *pos;
-
- (*pos)++;
- if (index >= ftrace_graph_count)
+ if (*pos >= ftrace_graph_count)
return NULL;
+ return &array[*pos];
+}
- return &array[index];
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __g_next(m, pos);
}
static void *g_start(struct seq_file *m, loff_t *pos)
{
- void *p = NULL;
-
mutex_lock(&graph_lock);
- p = g_next(m, p, pos);
+ /* Nothing, tell g_show to print all functions are enabled */
+ if (!ftrace_graph_count && !*pos)
+ return (void *)1;
- return p;
+ return __g_next(m, pos);
}
static void g_stop(struct seq_file *m, void *p)
@@ -1372,14 +2490,16 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
unsigned long *ptr = v;
- char str[KSYM_SYMBOL_LEN];
if (!ptr)
return 0;
- kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
+ if (ptr == (unsigned long *)1) {
+ seq_printf(m, "#### all functions enabled ####\n");
+ return 0;
+ }
- seq_printf(m, "%s\n", str);
+ seq_printf(m, "%pf\n", v);
return 0;
}
@@ -1401,7 +2521,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
mutex_lock(&graph_lock);
if ((file->f_mode & FMODE_WRITE) &&
- !(file->f_flags & O_APPEND)) {
+ (file->f_flags & O_TRUNC)) {
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
@@ -1419,53 +2539,61 @@ ftrace_graph_open(struct inode *inode, struct file *file)
return ret;
}
-static ssize_t
-ftrace_graph_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int
+ftrace_graph_release(struct inode *inode, struct file *file)
{
if (file->f_mode & FMODE_READ)
- return seq_read(file, ubuf, cnt, ppos);
- else
- return -EPERM;
+ seq_release(inode, file);
+ return 0;
}
static int
-ftrace_set_func(unsigned long *array, int idx, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, char *buffer)
{
- char str[KSYM_SYMBOL_LEN];
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ int search_len;
int found = 0;
- int i, j;
+ int type, not;
+ char *search;
+ bool exists;
+ int i;
if (ftrace_disabled)
return -ENODEV;
- /* should not be called from interrupt context */
- spin_lock(&ftrace_lock);
+ /* decode regex */
+ type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+ if (not)
+ return -EINVAL;
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- for (i = 0; i < pg->index; i++) {
- rec = &pg->records[i];
+ search_len = strlen(search);
- if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
- continue;
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+ break;
+
+ if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+ continue;
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
- if (strcmp(str, buffer) == 0) {
+ if (ftrace_match_record(rec, search, search_len, type)) {
+ /* ensure it is not already in the array */
+ exists = false;
+ for (i = 0; i < *idx; i++)
+ if (array[i] == rec->ip) {
+ exists = true;
+ break;
+ }
+ if (!exists) {
+ array[(*idx)++] = rec->ip;
found = 1;
- for (j = 0; j < idx; j++)
- if (array[j] == rec->ip) {
- found = 0;
- break;
- }
- if (found)
- array[idx] = rec->ip;
- break;
}
}
- }
- spin_unlock(&ftrace_lock);
+ } while_for_each_ftrace_rec();
+
+ mutex_unlock(&ftrace_lock);
return found ? 0 : -EINVAL;
}
@@ -1533,13 +2661,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
}
buffer[index] = 0;
- /* we allow only one at a time */
- ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
if (ret)
goto out;
- ftrace_graph_count++;
-
file->f_pos += read;
ret = read;
@@ -1550,46 +2676,32 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
}
static const struct file_operations ftrace_graph_fops = {
- .open = ftrace_graph_open,
- .read = ftrace_graph_read,
- .write = ftrace_graph_write,
+ .open = ftrace_graph_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .release = ftrace_graph_release,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
{
- struct dentry *entry;
- entry = debugfs_create_file("available_filter_functions", 0444,
- d_tracer, NULL, &ftrace_avail_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'available_filter_functions' entry\n");
+ trace_create_file("available_filter_functions", 0444,
+ d_tracer, NULL, &ftrace_avail_fops);
- entry = debugfs_create_file("failures", 0444,
- d_tracer, NULL, &ftrace_failures_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'failures' entry\n");
+ trace_create_file("failures", 0444,
+ d_tracer, NULL, &ftrace_failures_fops);
- entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
- NULL, &ftrace_filter_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_filter' entry\n");
+ trace_create_file("set_ftrace_filter", 0644, d_tracer,
+ NULL, &ftrace_filter_fops);
- entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+ trace_create_file("set_ftrace_notrace", 0644, d_tracer,
NULL, &ftrace_notrace_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_notrace' entry\n");
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+ trace_create_file("set_graph_function", 0444, d_tracer,
NULL,
&ftrace_graph_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_graph_function' entry\n");
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
return 0;
@@ -1603,7 +2715,7 @@ static int ftrace_convert_nops(struct module *mod,
unsigned long addr;
unsigned long flags;
- mutex_lock(&ftrace_start_lock);
+ mutex_lock(&ftrace_lock);
p = start;
while (p < end) {
addr = ftrace_call_adjust(*p++);
@@ -1622,19 +2734,77 @@ static int ftrace_convert_nops(struct module *mod,
local_irq_save(flags);
ftrace_update_code(mod);
local_irq_restore(flags);
- mutex_unlock(&ftrace_start_lock);
+ mutex_unlock(&ftrace_lock);
return 0;
}
-void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ unsigned long s = (unsigned long)start;
+ unsigned long e = (unsigned long)end;
+
+ if (ftrace_disabled || !start || start == end)
+ return;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_rec(pg, rec) {
+ if ((rec->ip >= s) && (rec->ip < e)) {
+ /*
+ * rec->ip is changed in ftrace_free_rec()
+ * It should not between s and e if record was freed.
+ */
+ FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+ ftrace_free_rec(rec);
+ }
+ } while_for_each_ftrace_rec();
+ mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+ unsigned long *start, unsigned long *end)
{
if (ftrace_disabled || start == end)
return;
ftrace_convert_nops(mod, start, end);
}
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+ break;
+ case MODULE_STATE_GOING:
+ ftrace_release(mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+ break;
+ }
+
+ return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+ .notifier_call = ftrace_module_notify,
+ .priority = 0,
+};
+
extern unsigned long __start_mcount_loc[];
extern unsigned long __stop_mcount_loc[];
@@ -1666,6 +2836,12 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
+ ret = register_module_notifier(&ftrace_module_nb);
+ if (ret)
+ pr_warning("Failed to register trace ftrace module notifier\n");
+
+ set_ftrace_early_filters();
+
return;
failed:
ftrace_disabled = 1;
@@ -1699,7 +2875,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
if (ftrace_pid_trace == ftrace_swapper_pid)
r = sprintf(buf, "swapper tasks\n");
else if (ftrace_pid_trace)
- r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
+ r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
else
r = sprintf(buf, "no pid\n");
@@ -1736,9 +2912,12 @@ static void clear_ftrace_pid(struct pid *pid)
{
struct task_struct *p;
+ rcu_read_lock();
do_each_pid_task(pid, PIDTYPE_PID, p) {
clear_tsk_trace_trace(p);
} while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
+
put_pid(pid);
}
@@ -1746,9 +2925,11 @@ static void set_ftrace_pid(struct pid *pid)
{
struct task_struct *p;
+ rcu_read_lock();
do_each_pid_task(pid, PIDTYPE_PID, p) {
set_tsk_trace_trace(p);
} while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
}
static void clear_ftrace_pid_task(struct pid **pid)
@@ -1790,7 +2971,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- mutex_lock(&ftrace_start_lock);
+ mutex_lock(&ftrace_lock);
if (val < 0) {
/* disable pid tracing */
if (!ftrace_pid_trace)
@@ -1829,12 +3010,12 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
ftrace_startup_enable(0);
out:
- mutex_unlock(&ftrace_start_lock);
+ mutex_unlock(&ftrace_lock);
return cnt;
}
-static struct file_operations ftrace_pid_fops = {
+static const struct file_operations ftrace_pid_fops = {
.read = ftrace_pid_read,
.write = ftrace_pid_write,
};
@@ -1842,7 +3023,6 @@ static struct file_operations ftrace_pid_fops = {
static __init int ftrace_init_debugfs(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -1850,14 +3030,13 @@ static __init int ftrace_init_debugfs(void)
ftrace_init_dyn_debugfs(d_tracer);
- entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
- NULL, &ftrace_pid_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_ftrace_pid' entry\n");
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ NULL, &ftrace_pid_fops);
+
+ ftrace_profile_debugfs(d_tracer);
+
return 0;
}
-
fs_initcall(ftrace_init_debugfs);
/**
@@ -1892,17 +3071,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
if (unlikely(ftrace_disabled))
return -1;
- mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_lock);
ret = __register_ftrace_function(ops);
ftrace_startup(0);
- mutex_unlock(&ftrace_sysctl_lock);
+ mutex_unlock(&ftrace_lock);
return ret;
}
/**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
* @ops - ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
@@ -1911,10 +3090,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_lock);
ret = __unregister_ftrace_function(ops);
ftrace_shutdown(0);
- mutex_unlock(&ftrace_sysctl_lock);
+ mutex_unlock(&ftrace_lock);
return ret;
}
@@ -1929,14 +3108,14 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (unlikely(ftrace_disabled))
return -ENODEV;
- mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_lock);
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
- if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+ if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
- last_ftrace_enabled = ftrace_enabled;
+ last_ftrace_enabled = !!ftrace_enabled;
if (ftrace_enabled) {
@@ -1958,13 +3137,14 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
}
out:
- mutex_unlock(&ftrace_sysctl_lock);
+ mutex_unlock(&ftrace_lock);
return ret;
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -2005,12 +3185,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
if (t->ret_stack == NULL) {
- t->curr_ret_stack = -1;
- /* Make sure IRQs see the -1 first: */
- barrier();
- t->ret_stack = ret_stack_list[start++];
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
+ t->curr_ret_stack = -1;
+ /* Make sure the tasks see the -1 first: */
+ smp_wmb();
+ t->ret_stack = ret_stack_list[start++];
}
} while_each_thread(g, t);
@@ -2022,11 +3202,43 @@ free:
return ret;
}
+static void
+ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long long timestamp;
+ int index;
+
+ /*
+ * Does the user want to count the time a function was asleep.
+ * If so, do not update the time stamps.
+ */
+ if (trace_flags & TRACE_ITER_SLEEP_TIME)
+ return;
+
+ timestamp = trace_clock_local();
+
+ prev->ftrace_timestamp = timestamp;
+
+ /* only process tasks that we timestamped */
+ if (!next->ftrace_timestamp)
+ return;
+
+ /*
+ * Update all the counters in next to make up for the
+ * time next was sleeping.
+ */
+ timestamp -= next->ftrace_timestamp;
+
+ for (index = next->curr_ret_stack; index >= 0; index--)
+ next->ret_stack[index].calltime += timestamp;
+}
+
/* Allocate a return stack for each task */
static int start_graph_tracing(void)
{
struct ftrace_ret_stack **ret_stack_list;
- int ret;
+ int ret, cpu;
ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
sizeof(struct ftrace_ret_stack *),
@@ -2035,25 +3247,68 @@ static int start_graph_tracing(void)
if (!ret_stack_list)
return -ENOMEM;
+ /* The cpu_boot init_task->ret_stack will never be freed */
+ for_each_online_cpu(cpu) {
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_task(idle_task(cpu));
+ }
+
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
+ if (!ret) {
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ if (ret)
+ pr_info("ftrace_graph: Couldn't activate tracepoint"
+ " probe to kernel_sched_switch\n");
+ }
+
kfree(ret_stack_list);
return ret;
}
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+ void *unused)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ pause_graph_tracing();
+ break;
+
+ case PM_POST_HIBERNATION:
+ unpause_graph_tracing();
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
int ret = 0;
- mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_lock);
- atomic_inc(&ftrace_graph_active);
+ /* we currently allow only one tracer registered at a time */
+ if (ftrace_graph_active) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+ register_pm_notifier(&ftrace_suspend_notifier);
+
+ ftrace_graph_active++;
ret = start_graph_tracing();
if (ret) {
- atomic_dec(&ftrace_graph_active);
+ ftrace_graph_active--;
goto out;
}
@@ -2063,36 +3318,50 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_startup(FTRACE_START_FUNC_RET);
out:
- mutex_unlock(&ftrace_sysctl_lock);
+ mutex_unlock(&ftrace_lock);
return ret;
}
void unregister_ftrace_graph(void)
{
- mutex_lock(&ftrace_sysctl_lock);
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(!ftrace_graph_active))
+ goto out;
- atomic_dec(&ftrace_graph_active);
+ ftrace_graph_active--;
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
- mutex_unlock(&ftrace_sysctl_lock);
+ out:
+ mutex_unlock(&ftrace_lock);
}
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
- if (atomic_read(&ftrace_graph_active)) {
- t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ /* Make sure we do not use the parent ret_stack */
+ t->ret_stack = NULL;
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
* sizeof(struct ftrace_ret_stack),
GFP_KERNEL);
- if (!t->ret_stack)
+ if (!ret_stack)
return;
t->curr_ret_stack = -1;
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
- } else
- t->ret_stack = NULL;
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visable before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+ }
}
void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 00000000000..81b1645c854
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,511 @@
+/*
+ * Memory allocator tracing
+ *
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/tracepoint.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+
+#include <linux/kmemtrace.h>
+
+#include "trace_output.h"
+#include "trace.h"
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_KMEM_OPT_MINIMAL 0x1
+
+static struct tracer_opt kmem_opts[] = {
+ /* Default disable the minimalistic output */
+ { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
+ { }
+};
+
+static struct tracer_flags kmem_tracer_flags = {
+ .val = 0,
+ .opts = kmem_opts
+};
+
+static struct trace_array *kmemtrace_array;
+
+/* Trace allocations */
+static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ struct ftrace_event_call *call = &event_kmem_alloc;
+ struct trace_array *tr = kmemtrace_array;
+ struct kmemtrace_alloc_entry *entry;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_ALLOC;
+ entry->type_id = type_id;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+ entry->bytes_req = bytes_req;
+ entry->bytes_alloc = bytes_alloc;
+ entry->gfp_flags = gfp_flags;
+ entry->node = node;
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
+
+ trace_wake_up();
+}
+
+static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
+ unsigned long call_site,
+ const void *ptr)
+{
+ struct ftrace_event_call *call = &event_kmem_free;
+ struct trace_array *tr = kmemtrace_array;
+ struct kmemtrace_free_entry *entry;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ tracing_generic_entry_update(&entry->ent, 0, 0);
+
+ entry->ent.type = TRACE_KMEM_FREE;
+ entry->type_id = type_id;
+ entry->call_site = call_site;
+ entry->ptr = ptr;
+
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
+
+ trace_wake_up();
+}
+
+static void kmemtrace_kmalloc(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmalloc_node(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+ const void *ptr,
+ size_t bytes_req,
+ size_t bytes_alloc,
+ gfp_t gfp_flags,
+ int node)
+{
+ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+ bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+{
+ kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
+}
+
+static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+{
+ kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
+}
+
+static int kmemtrace_start_probes(void)
+{
+ int err;
+
+ err = register_trace_kmalloc(kmemtrace_kmalloc);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ if (err)
+ return err;
+ err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ if (err)
+ return err;
+ err = register_trace_kfree(kmemtrace_kfree);
+ if (err)
+ return err;
+ err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+
+ return err;
+}
+
+static void kmemtrace_stop_probes(void)
+{
+ unregister_trace_kmalloc(kmemtrace_kmalloc);
+ unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ unregister_trace_kfree(kmemtrace_kfree);
+ unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+}
+
+static int kmem_trace_init(struct trace_array *tr)
+{
+ kmemtrace_array = tr;
+
+ tracing_reset_online_cpus(tr);
+
+ kmemtrace_start_probes();
+
+ return 0;
+}
+
+static void kmem_trace_reset(struct trace_array *tr)
+{
+ kmemtrace_stop_probes();
+}
+
+static void kmemtrace_headers(struct seq_file *s)
+{
+ /* Don't need headers for the original kmemtrace output */
+ if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+ return;
+
+ seq_printf(s, "#\n");
+ seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
+ " POINTER NODE CALLER\n");
+ seq_printf(s, "# FREE | | | | "
+ " | | | |\n");
+ seq_printf(s, "# |\n\n");
+}
+
+/*
+ * The following functions give the original output from kmemtrace,
+ * plus the origin CPU, since reordering occurs in-kernel now.
+ */
+
+#define KMEMTRACE_USER_ALLOC 0
+#define KMEMTRACE_USER_FREE 1
+
+struct kmemtrace_user_event {
+ u8 event_id;
+ u8 type_id;
+ u16 event_size;
+ u32 cpu;
+ u64 timestamp;
+ unsigned long call_site;
+ unsigned long ptr;
+};
+
+struct kmemtrace_user_event_alloc {
+ size_t bytes_req;
+ size_t bytes_alloc;
+ unsigned gfp_flags;
+ int node;
+};
+
+static enum print_line_t
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_alloc_entry *entry;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
+ "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+ entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
+ (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
+ (unsigned long)entry->gfp_flags, entry->node);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_free_entry *entry;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
+ entry->type_id, (void *)entry->call_site,
+ (unsigned long)entry->ptr);
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_alloc_entry *entry;
+ struct kmemtrace_user_event *ev;
+ struct kmemtrace_user_event_alloc *ev_alloc;
+
+ trace_assign_type(entry, iter->ent);
+
+ ev = trace_seq_reserve(s, sizeof(*ev));
+ if (!ev)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ev->event_id = KMEMTRACE_USER_ALLOC;
+ ev->type_id = entry->type_id;
+ ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
+ ev->cpu = iter->cpu;
+ ev->timestamp = iter->ts;
+ ev->call_site = entry->call_site;
+ ev->ptr = (unsigned long)entry->ptr;
+
+ ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
+ if (!ev_alloc)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ev_alloc->bytes_req = entry->bytes_req;
+ ev_alloc->bytes_alloc = entry->bytes_alloc;
+ ev_alloc->gfp_flags = entry->gfp_flags;
+ ev_alloc->node = entry->node;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct kmemtrace_free_entry *entry;
+ struct kmemtrace_user_event *ev;
+
+ trace_assign_type(entry, iter->ent);
+
+ ev = trace_seq_reserve(s, sizeof(*ev));
+ if (!ev)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ ev->event_id = KMEMTRACE_USER_FREE;
+ ev->type_id = entry->type_id;
+ ev->event_size = sizeof(*ev);
+ ev->cpu = iter->cpu;
+ ev->timestamp = iter->ts;
+ ev->call_site = entry->call_site;
+ ev->ptr = (unsigned long)entry->ptr;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+/* The two other following provide a more minimalistic output */
+static enum print_line_t
+kmemtrace_print_alloc_compress(struct trace_iterator *iter)
+{
+ struct kmemtrace_alloc_entry *entry;
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ /* Alloc entry */
+ ret = trace_seq_printf(s, " + ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Type */
+ switch (entry->type_id) {
+ case KMEMTRACE_TYPE_KMALLOC:
+ ret = trace_seq_printf(s, "K ");
+ break;
+ case KMEMTRACE_TYPE_CACHE:
+ ret = trace_seq_printf(s, "C ");
+ break;
+ case KMEMTRACE_TYPE_PAGES:
+ ret = trace_seq_printf(s, "P ");
+ break;
+ default:
+ ret = trace_seq_printf(s, "? ");
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Requested */
+ ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Allocated */
+ ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Flags
+ * TODO: would be better to see the name of the GFP flag names
+ */
+ ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Pointer to allocated */
+ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Node and call site*/
+ ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
+ (void *)entry->call_site);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_compress(struct trace_iterator *iter)
+{
+ struct kmemtrace_free_entry *entry;
+ struct trace_seq *s = &iter->seq;
+ int ret;
+
+ trace_assign_type(entry, iter->ent);
+
+ /* Free entry */
+ ret = trace_seq_printf(s, " - ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Type */
+ switch (entry->type_id) {
+ case KMEMTRACE_TYPE_KMALLOC:
+ ret = trace_seq_printf(s, "K ");
+ break;
+ case KMEMTRACE_TYPE_CACHE:
+ ret = trace_seq_printf(s, "C ");
+ break;
+ case KMEMTRACE_TYPE_PAGES:
+ ret = trace_seq_printf(s, "P ");
+ break;
+ default:
+ ret = trace_seq_printf(s, "? ");
+ }
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Skip requested/allocated/flags */
+ ret = trace_seq_printf(s, " ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Pointer to allocated */
+ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Skip node and print call site*/
+ ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+
+ if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+ return TRACE_TYPE_UNHANDLED;
+
+ switch (entry->type) {
+ case TRACE_KMEM_ALLOC:
+ return kmemtrace_print_alloc_compress(iter);
+ case TRACE_KMEM_FREE:
+ return kmemtrace_print_free_compress(iter);
+ default:
+ return TRACE_TYPE_UNHANDLED;
+ }
+}
+
+static struct trace_event kmem_trace_alloc = {
+ .type = TRACE_KMEM_ALLOC,
+ .trace = kmemtrace_print_alloc,
+ .binary = kmemtrace_print_alloc_user,
+};
+
+static struct trace_event kmem_trace_free = {
+ .type = TRACE_KMEM_FREE,
+ .trace = kmemtrace_print_free,
+ .binary = kmemtrace_print_free_user,
+};
+
+static struct tracer kmem_tracer __read_mostly = {
+ .name = "kmemtrace",
+ .init = kmem_trace_init,
+ .reset = kmem_trace_reset,
+ .print_line = kmemtrace_print_line,
+ .print_header = kmemtrace_headers,
+ .flags = &kmem_tracer_flags
+};
+
+void kmemtrace_init(void)
+{
+ /* earliest opportunity to start kmem tracing */
+}
+
+static int __init init_kmem_tracer(void)
+{
+ if (!register_ftrace_event(&kmem_trace_alloc)) {
+ pr_warning("Warning: could not register kmem events\n");
+ return 1;
+ }
+
+ if (!register_ftrace_event(&kmem_trace_free)) {
+ pr_warning("Warning: could not register kmem events\n");
+ return 1;
+ }
+
+ if (!register_tracer(&kmem_tracer)) {
+ pr_warning("Warning: could not register the kmem tracer\n");
+ return 1;
+ }
+
+ return 0;
+}
+device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662e..454e74e718c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,21 +4,115 @@
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
#include <linux/ring_buffer.h>
+#include <linux/trace_clock.h>
+#include <linux/ftrace_irq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
-#include <linux/sched.h> /* used for sched_clock() (for now) */
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/list.h>
+#include <linux/cpu.h>
#include <linux/fs.h>
#include "trace.h"
/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+ int ret;
+
+ ret = trace_seq_printf(s, "# compressed entry header\n");
+ ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
+ ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
+ ret = trace_seq_printf(s, "\tarray : 32 bits\n");
+ ret = trace_seq_printf(s, "\n");
+ ret = trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+ return ret;
+}
+
+/*
+ * The ring buffer is made up of a list of pages. A separate list of pages is
+ * allocated for each CPU. A writer may only write to a buffer that is
+ * associated with the CPU it is currently executing on. A reader may read
+ * from any per cpu buffer.
+ *
+ * The reader is special. For each per cpu buffer, the reader has its own
+ * reader page. When a reader has read the entire reader page, this reader
+ * page is swapped with another page in the ring buffer.
+ *
+ * Now, as long as the writer is off the reader page, the reader can do what
+ * ever it wants with that page. The writer will never write to that page
+ * again (as long as it is out of the ring buffer).
+ *
+ * Here's some silly ASCII art.
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |
+ * +------+ +---+ +---+ +---+
+ * | |-->| |-->| |
+ * +---+ +---+ +---+
+ * ^ |
+ * | |
+ * +---------------+
+ *
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * | |-->| |-->| |
+ * +---+ +---+ +---+
+ * ^ |
+ * | |
+ * +---------------+
+ *
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * ^ | |-->| |-->| |
+ * | +---+ +---+ +---+
+ * | |
+ * | |
+ * +------------------------------+
+ *
+ *
+ * +------+
+ * |buffer| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * ^ | | | |-->| |
+ * | New +---+ +---+ +---+
+ * | Reader------^ |
+ * | page |
+ * +------------------------------+
+ *
+ *
+ * After we make this swap, the reader can hand this page off to the splice
+ * code and be done with it. It can even allocate a new page if it needs to
+ * and swap that into the ring buffer.
+ *
+ * We will be using cmpxchg soon to make all this lockless.
+ *
+ */
+
+/*
* A fast way to enable or disable all ring buffers is to
* call tracing_on or tracing_off. Turning off the ring buffers
* prevents all ring buffers from being recorded to.
@@ -57,7 +151,9 @@ enum {
RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
};
-static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
/**
* tracing_on - enable all tracing buffers
@@ -89,59 +185,71 @@ EXPORT_SYMBOL_GPL(tracing_off);
* tracing_off_permanent - permanently disable ring buffers
*
* This function, once called, will disable all ring buffers
- * permanenty.
+ * permanently.
*/
void tracing_off_permanent(void)
{
set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
}
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+ return ring_buffer_flags == RB_BUFFERS_ON;
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
#include "trace.h"
-/* Up this if you want to test the TIME_EXTENTS and normalization */
-#define DEBUG_SHIFT 0
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
+#define RB_ALIGNMENT 4U
+#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-/* FIXME!!! */
-u64 ring_buffer_time_stamp(int cpu)
-{
- u64 time;
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
- preempt_disable_notrace();
- /* shift to debug/test normalization and TIME_EXTENTS */
- time = sched_clock() << DEBUG_SHIFT;
- preempt_enable_no_resched_notrace();
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
- return time;
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
-EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+static void rb_event_set_padding(struct ring_buffer_event *event)
{
- /* Just stupid testing the normalize function and deltas */
- *ts >>= DEBUG_SHIFT;
+ /* padding has a NULL time_delta */
+ event->type_len = RINGBUF_TYPE_PADDING;
+ event->time_delta = 0;
}
-EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
-#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
-#define RB_ALIGNMENT_SHIFT 2
-#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
-#define RB_MAX_SMALL_DATA 28
+static unsigned
+rb_event_data_length(struct ring_buffer_event *event)
+{
+ unsigned length;
-enum {
- RB_LEN_TIME_EXTEND = 8,
- RB_LEN_TIME_STAMP = 16,
-};
+ if (event->type_len)
+ length = event->type_len * RB_ALIGNMENT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+}
/* inline for ring buffer fast paths */
-static inline unsigned
+static unsigned
rb_event_length(struct ring_buffer_event *event)
{
- unsigned length;
-
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
- /* undefined */
- return -1;
+ if (rb_null_event(event))
+ /* undefined */
+ return -1;
+ return event->array[0] + RB_EVNT_HDR_SIZE;
case RINGBUF_TYPE_TIME_EXTEND:
return RB_LEN_TIME_EXTEND;
@@ -150,11 +258,7 @@ rb_event_length(struct ring_buffer_event *event)
return RB_LEN_TIME_STAMP;
case RINGBUF_TYPE_DATA:
- if (event->len)
- length = event->len << RB_ALIGNMENT_SHIFT;
- else
- length = event->array[0];
- return length + RB_EVNT_HDR_SIZE;
+ return rb_event_data_length(event);
default:
BUG();
}
@@ -169,7 +273,7 @@ rb_event_length(struct ring_buffer_event *event)
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length = rb_event_length(event);
- if (event->type != RINGBUF_TYPE_DATA)
+ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -179,12 +283,12 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
EXPORT_SYMBOL_GPL(ring_buffer_event_length);
/* inline for ring buffer fast paths */
-static inline void *
+static void *
rb_event_data(struct ring_buffer_event *event)
{
- BUG_ON(event->type != RINGBUF_TYPE_DATA);
+ BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
- if (event->len)
+ if (event->type_len)
return (void *)&event->array[0];
/* Otherwise length is in array[0] and array[1] has the data */
return (void *)&event->array[1];
@@ -209,30 +313,65 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
- local_t commit; /* write commited index */
+ local_t commit; /* write committed index */
unsigned char data[]; /* data of buffer page */
};
+/*
+ * Note, the buffer_page list must be first. The buffer pages
+ * are allocated in cache lines, which means that each buffer
+ * page will be at the beginning of a cache line, and thus
+ * the least significant bits will be zero. We use this to
+ * add flags in the list struct pointers, to make the ring buffer
+ * lockless.
+ */
struct buffer_page {
+ struct list_head list; /* list of buffer pages */
local_t write; /* index for next write */
unsigned read; /* index for next read */
- struct list_head list; /* list of free pages */
+ local_t entries; /* entries on this page */
struct buffer_data_page *page; /* Actual data page */
};
+/*
+ * The buffer page counters, write and entries, must be reset
+ * atomically when crossing page boundaries. To synchronize this
+ * update, two counters are inserted into the number. One is
+ * the actual counter for the write position or count on the page.
+ *
+ * The other is a counter of updaters. Before an update happens
+ * the update partition of the counter is incremented. This will
+ * allow the updater to update the counter atomically.
+ *
+ * The counter is 20 bits, and the state data is 12.
+ */
+#define RB_WRITE_MASK 0xfffff
+#define RB_WRITE_INTCNT (1 << 20)
+
static void rb_init_page(struct buffer_data_page *bpage)
{
local_set(&bpage->commit, 0);
}
+/**
+ * ring_buffer_page_len - the size of data on the page.
+ * @page: The page to read
+ *
+ * Returns the amount of data on the page, including buffer page header.
+ */
+size_t ring_buffer_page_len(void *page)
+{
+ return local_read(&((struct buffer_data_page *)page)->commit)
+ + BUF_PAGE_HDR_SIZE;
+}
+
/*
* Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
* this issue out.
*/
-static inline void free_buffer_page(struct buffer_page *bpage)
+static void free_buffer_page(struct buffer_page *bpage)
{
- if (bpage->page)
- free_page((unsigned long)bpage->page);
+ free_page((unsigned long)bpage->page);
kfree(bpage);
}
@@ -246,7 +385,35 @@ static inline int test_time_stamp(u64 delta)
return 0;
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
+#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
+
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+ struct buffer_data_page field;
+ int ret;
+
+ ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\n",
+ (unsigned int)sizeof(field.time_stamp));
+
+ ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit));
+
+ ret = trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE);
+
+ return ret;
+}
/*
* head_page == tail_page && head == tail then buffer is empty.
@@ -254,16 +421,20 @@ static inline int test_time_stamp(u64 delta)
struct ring_buffer_per_cpu {
int cpu;
struct ring_buffer *buffer;
- spinlock_t reader_lock; /* serialize readers */
+ spinlock_t reader_lock; /* serialize readers */
raw_spinlock_t lock;
struct lock_class_key lock_key;
- struct list_head pages;
+ struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
- struct buffer_page *commit_page; /* commited pages */
+ struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
- unsigned long overrun;
- unsigned long entries;
+ local_t commit_overrun;
+ local_t overrun;
+ local_t entries;
+ local_t committing;
+ local_t commits;
+ unsigned long read;
u64 write_stamp;
u64 read_stamp;
atomic_t record_disabled;
@@ -273,12 +444,19 @@ struct ring_buffer {
unsigned pages;
unsigned flags;
int cpus;
- cpumask_var_t cpumask;
atomic_t record_disabled;
+ cpumask_var_t cpumask;
+
+ struct lock_class_key *reader_lock_key;
struct mutex mutex;
struct ring_buffer_per_cpu **buffers;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ struct notifier_block cpu_notify;
+#endif
+ u64 (*clock)(void);
};
struct ring_buffer_iter {
@@ -289,33 +467,456 @@ struct ring_buffer_iter {
};
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
-#define RB_WARN_ON(buffer, cond) \
- ({ \
- int _____ret = unlikely(cond); \
- if (_____ret) { \
- atomic_inc(&buffer->record_disabled); \
- WARN_ON(1); \
- } \
- _____ret; \
+#define RB_WARN_ON(b, cond) \
+ ({ \
+ int _____ret = unlikely(cond); \
+ if (_____ret) { \
+ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
+ struct ring_buffer_per_cpu *__b = \
+ (void *)b; \
+ atomic_inc(&__b->buffer->record_disabled); \
+ } else \
+ atomic_inc(&b->record_disabled); \
+ WARN_ON(1); \
+ } \
+ _____ret; \
})
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return buffer->clock() << DEBUG_SHIFT;
+}
+
+u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+ u64 time;
+
+ preempt_disable_notrace();
+ time = rb_time_stamp(buffer, cpu);
+ preempt_enable_no_resched_notrace();
+
+ return time;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
+
+void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+ int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+
+/*
+ * Making the ring buffer lockless makes things tricky.
+ * Although writes only happen on the CPU that they are on,
+ * and they only need to worry about interrupts. Reads can
+ * happen on any CPU.
+ *
+ * The reader page is always off the ring buffer, but when the
+ * reader finishes with a page, it needs to swap its page with
+ * a new one from the buffer. The reader needs to take from
+ * the head (writes go to the tail). But if a writer is in overwrite
+ * mode and wraps, it must push the head page forward.
+ *
+ * Here lies the problem.
+ *
+ * The reader must be careful to replace only the head page, and
+ * not another one. As described at the top of the file in the
+ * ASCII art, the reader sets its old page to point to the next
+ * page after head. It then sets the page after head to point to
+ * the old reader page. But if the writer moves the head page
+ * during this operation, the reader could end up with the tail.
+ *
+ * We use cmpxchg to help prevent this race. We also do something
+ * special with the page before head. We set the LSB to 1.
+ *
+ * When the writer must push the page forward, it will clear the
+ * bit that points to the head page, move the head, and then set
+ * the bit that points to the new head page.
+ *
+ * We also don't want an interrupt coming in and moving the head
+ * page on another writer. Thus we use the second LSB to catch
+ * that too. Thus:
+ *
+ * head->list->prev->next bit 1 bit 0
+ * ------- -------
+ * Normal page 0 0
+ * Points to head page 0 1
+ * New head page 1 0
+ *
+ * Note we can not trust the prev pointer of the head page, because:
+ *
+ * +----+ +-----+ +-----+
+ * | |------>| T |---X--->| N |
+ * | |<------| | | |
+ * +----+ +-----+ +-----+
+ * ^ ^ |
+ * | +-----+ | |
+ * +----------| R |----------+ |
+ * | |<-----------+
+ * +-----+
+ *
+ * Key: ---X--> HEAD flag set in pointer
+ * T Tail page
+ * R Reader page
+ * N Next page
+ *
+ * (see __rb_reserve_next() to see where this happens)
+ *
+ * What the above shows is that the reader just swapped out
+ * the reader page with a page in the buffer, but before it
+ * could make the new header point back to the new page added
+ * it was preempted by a writer. The writer moved forward onto
+ * the new page added by the reader and is about to move forward
+ * again.
+ *
+ * You can see, it is legitimate for the previous pointer of
+ * the head (or any page) not to point back to itself. But only
+ * temporarially.
+ */
+
+#define RB_PAGE_NORMAL 0UL
+#define RB_PAGE_HEAD 1UL
+#define RB_PAGE_UPDATE 2UL
+
+
+#define RB_FLAG_MASK 3UL
+
+/* PAGE_MOVED is not part of the mask */
+#define RB_PAGE_MOVED 4UL
+
+/*
+ * rb_list_head - remove any bit
+ */
+static struct list_head *rb_list_head(struct list_head *list)
+{
+ unsigned long val = (unsigned long)list;
+
+ return (struct list_head *)(val & ~RB_FLAG_MASK);
+}
+
+/*
+ * rb_is_head_page - test if the give page is the head page
+ *
+ * Because the reader may move the head_page pointer, we can
+ * not trust what the head page is (it may be pointing to
+ * the reader page). But if the next page is a header page,
+ * its flags will be non zero.
+ */
+static int inline
+rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *page, struct list_head *list)
+{
+ unsigned long val;
+
+ val = (unsigned long)list->next;
+
+ if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
+ return RB_PAGE_MOVED;
+
+ return val & RB_FLAG_MASK;
+}
+
+/*
+ * rb_is_reader_page
+ *
+ * The unique thing about the reader page, is that, if the
+ * writer is ever on it, the previous pointer never points
+ * back to the reader page.
+ */
+static int rb_is_reader_page(struct buffer_page *page)
+{
+ struct list_head *list = page->list.prev;
+
+ return rb_list_head(list->next) != &page->list;
+}
+
+/*
+ * rb_set_list_to_head - set a list_head to be pointing to head.
+ */
+static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ unsigned long *ptr;
+
+ ptr = (unsigned long *)&list->next;
+ *ptr |= RB_PAGE_HEAD;
+ *ptr &= ~RB_PAGE_UPDATE;
+}
+
+/*
+ * rb_head_page_activate - sets up head page
+ */
+static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+
+ head = cpu_buffer->head_page;
+ if (!head)
+ return;
+
+ /*
+ * Set the previous list pointer to have the HEAD flag.
+ */
+ rb_set_list_to_head(cpu_buffer, head->list.prev);
+}
+
+static void rb_list_head_clear(struct list_head *list)
+{
+ unsigned long *ptr = (unsigned long *)&list->next;
+
+ *ptr &= ~RB_FLAG_MASK;
+}
+
+/*
+ * rb_head_page_dactivate - clears head page ptr (for free list)
+ */
+static void
+rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *hd;
+
+ /* Go through the whole list and clear any pointers found. */
+ rb_list_head_clear(cpu_buffer->pages);
+
+ list_for_each(hd, cpu_buffer->pages)
+ rb_list_head_clear(hd);
+}
+
+static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag, int new_flag)
+{
+ struct list_head *list;
+ unsigned long val = (unsigned long)&head->list;
+ unsigned long ret;
+
+ list = &prev->list;
+
+ val &= ~RB_FLAG_MASK;
+
+ ret = (unsigned long)cmpxchg(&list->next,
+ val | old_flag, val | new_flag);
+
+ /* check if the reader took the page */
+ if ((ret & ~RB_FLAG_MASK) != val)
+ return RB_PAGE_MOVED;
+
+ return ret & RB_FLAG_MASK;
+}
+
+static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_UPDATE);
+}
+
+static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_HEAD);
+}
+
+static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_NORMAL);
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.next);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
+static struct buffer_page *
+rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+ struct buffer_page *page;
+ struct list_head *list;
+ int i;
+
+ if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
+ return NULL;
+
+ /* sanity check */
+ list = cpu_buffer->pages;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
+ return NULL;
+
+ page = head = cpu_buffer->head_page;
+ /*
+ * It is possible that the writer moves the header behind
+ * where we started, and we miss in one loop.
+ * A second loop should grab the header, but we'll do
+ * three loops just because I'm paranoid.
+ */
+ for (i = 0; i < 3; i++) {
+ do {
+ if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ cpu_buffer->head_page = page;
+ return page;
+ }
+ rb_inc_page(cpu_buffer, &page);
+ } while (page != head);
+ }
+
+ RB_WARN_ON(cpu_buffer, 1);
+
+ return NULL;
+}
+
+static int rb_head_page_replace(struct buffer_page *old,
+ struct buffer_page *new)
+{
+ unsigned long *ptr = (unsigned long *)&old->list.prev->next;
+ unsigned long val;
+ unsigned long ret;
+
+ val = *ptr & ~RB_FLAG_MASK;
+ val |= RB_PAGE_HEAD;
+
+ ret = cmpxchg(ptr, val, &new->list);
+
+ return ret == val;
+}
+
+/*
+ * rb_tail_page_update - move the tail page forward
+ *
+ * Returns 1 if moved tail page, 0 if someone else did.
+ */
+static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *old_tail;
+ unsigned long old_entries;
+ unsigned long old_write;
+ int ret = 0;
+
+ /*
+ * The tail page now needs to be moved forward.
+ *
+ * We need to reset the tail page, but without messing
+ * with possible erasing of data brought in by interrupts
+ * that have moved the tail page and are currently on it.
+ *
+ * We add a counter to the write field to denote this.
+ */
+ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+ old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+
+ /*
+ * Just make sure we have seen our old_write and synchronize
+ * with any interrupts that come in.
+ */
+ barrier();
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* Zero the write counter */
+ unsigned long val = old_write & ~RB_WRITE_MASK;
+ unsigned long eval = old_entries & ~RB_WRITE_MASK;
+
+ /*
+ * This will only succeed if an interrupt did
+ * not come in and change it. In which case, we
+ * do not want to modify it.
+ *
+ * We add (void) to let the compiler know that we do not care
+ * about the return value of these functions. We use the
+ * cmpxchg to only update if an interrupt did not already
+ * do it for us. If the cmpxchg fails, we don't care.
+ */
+ (void)local_cmpxchg(&next_page->write, old_write, val);
+ (void)local_cmpxchg(&next_page->entries, old_entries, eval);
+
+ /*
+ * No need to worry about races with clearing out the commit.
+ * it only can increment when a commit takes place. But that
+ * only happens in the outer most nested commit.
+ */
+ local_set(&next_page->page->commit, 0);
+
+ old_tail = cmpxchg(&cpu_buffer->tail_page,
+ tail_page, next_page);
+
+ if (old_tail == tail_page)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ unsigned long val = (unsigned long)bpage;
+
+ if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * rb_check_list - make sure a pointer to a list has the last bits zero
+ */
+static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
+ return 1;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
+ return 1;
+ return 0;
+}
+
/**
* check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
*
- * As a safty measure we check to make sure the data pages have not
+ * As a safety measure we check to make sure the data pages have not
* been corrupted.
*/
static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ rb_head_page_deactivate(cpu_buffer);
+
if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
return -1;
if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
return -1;
+ if (rb_check_list(cpu_buffer, head))
+ return -1;
+
list_for_each_entry_safe(bpage, tmp, head, list) {
if (RB_WARN_ON(cpu_buffer,
bpage->list.next->prev != &bpage->list))
@@ -323,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
bpage->list.prev->next != &bpage->list))
return -1;
+ if (rb_check_list(cpu_buffer, &bpage->list))
+ return -1;
}
+ rb_head_page_activate(cpu_buffer);
+
return 0;
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
unsigned nr_pages)
{
- struct list_head *head = &cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
unsigned long addr;
LIST_HEAD(pages);
unsigned i;
+ WARN_ON(!nr_pages);
+
for (i = 0; i < nr_pages; i++) {
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
+
+ rb_check_bpage(cpu_buffer, bpage);
+
list_add(&bpage->list, &pages);
addr = __get_free_page(GFP_KERNEL);
@@ -351,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_init_page(bpage->page);
}
- list_splice(&pages, head);
+ /*
+ * The ring buffer page list is a circular list that does not
+ * start and end with a list head. All page list items point to
+ * other pages.
+ */
+ cpu_buffer->pages = pages.next;
+ list_del(&pages);
rb_check_pages(cpu_buffer);
@@ -381,14 +996,16 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
cpu_buffer->cpu = cpu;
cpu_buffer->buffer = buffer;
spin_lock_init(&cpu_buffer->reader_lock);
+ lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- INIT_LIST_HEAD(&cpu_buffer->pages);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
goto fail_free_buffer;
+ rb_check_bpage(cpu_buffer, bpage);
+
cpu_buffer->reader_page = bpage;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
@@ -403,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
goto fail_free_reader;
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_head_page_activate(cpu_buffer);
+
return cpu_buffer;
fail_free_reader:
@@ -418,24 +1037,29 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = &cpu_buffer->pages;
+ struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
- list_del_init(&cpu_buffer->reader_page->list);
free_buffer_page(cpu_buffer->reader_page);
- list_for_each_entry_safe(bpage, tmp, head, list) {
- list_del_init(&bpage->list);
+ rb_head_page_deactivate(cpu_buffer);
+
+ if (head) {
+ list_for_each_entry_safe(bpage, tmp, head, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ bpage = list_entry(head, struct buffer_page, list);
free_buffer_page(bpage);
}
+
kfree(cpu_buffer);
}
-/*
- * Causes compile errors if the struct buffer_page gets bigger
- * than the struct page.
- */
-extern int ring_buffer_page_too_big(void);
+#ifdef CONFIG_HOTPLUG_CPU
+static int rb_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu);
+#endif
/**
* ring_buffer_alloc - allocate a new ring_buffer
@@ -447,17 +1071,13 @@ extern int ring_buffer_page_too_big(void);
* when the buffer wraps. If this flag is not set, the buffer will
* drop data when the tail hits the head.
*/
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
{
struct ring_buffer *buffer;
int bsize;
int cpu;
- /* Paranoid! Optimizes out when all is well */
- if (sizeof(struct buffer_page) > sizeof(struct page))
- ring_buffer_page_too_big();
-
-
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
GFP_KERNEL);
@@ -469,12 +1089,24 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
buffer->flags = flags;
+ buffer->clock = trace_clock_local;
+ buffer->reader_lock_key = key;
/* need at least two pages */
- if (buffer->pages == 1)
- buffer->pages++;
+ if (buffer->pages < 2)
+ buffer->pages = 2;
+ /*
+ * In case of non-hotplug cpu, if the ring-buffer is allocated
+ * in early initcall, it will not be notified of secondary cpus.
+ * In that off case, we need to allocate for all possible cpus.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+ get_online_cpus();
+ cpumask_copy(buffer->cpumask, cpu_online_mask);
+#else
cpumask_copy(buffer->cpumask, cpu_possible_mask);
+#endif
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -490,6 +1122,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
goto fail_free_buffers;
}
+#ifdef CONFIG_HOTPLUG_CPU
+ buffer->cpu_notify.notifier_call = rb_cpu_notify;
+ buffer->cpu_notify.priority = 0;
+ register_cpu_notifier(&buffer->cpu_notify);
+#endif
+
+ put_online_cpus();
mutex_init(&buffer->mutex);
return buffer;
@@ -503,12 +1142,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
+ put_online_cpus();
fail_free_buffer:
kfree(buffer);
return NULL;
}
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
* ring_buffer_free - free a ring buffer.
@@ -519,15 +1159,30 @@ ring_buffer_free(struct ring_buffer *buffer)
{
int cpu;
+ get_online_cpus();
+
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_cpu_notifier(&buffer->cpu_notify);
+#endif
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
+ put_online_cpus();
+
+ kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
kfree(buffer);
}
EXPORT_SYMBOL_GPL(ring_buffer_free);
+void ring_buffer_set_clock(struct ring_buffer *buffer,
+ u64 (*clock)(void))
+{
+ buffer->clock = clock;
+}
+
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
static void
@@ -540,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
+ rb_head_page_deactivate(cpu_buffer);
+
for (i = 0; i < nr_pages; i++) {
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
return;
- p = cpu_buffer->pages.next;
+ p = cpu_buffer->pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
- if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+ if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
return;
rb_reset_cpu(cpu_buffer);
@@ -570,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
atomic_inc(&cpu_buffer->record_disabled);
synchronize_sched();
+ spin_lock_irq(&cpu_buffer->reader_lock);
+ rb_head_page_deactivate(cpu_buffer);
+
for (i = 0; i < nr_pages; i++) {
if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
return;
p = pages->next;
bpage = list_entry(p, struct buffer_page, list);
list_del_init(&bpage->list);
- list_add_tail(&bpage->list, &cpu_buffer->pages);
+ list_add_tail(&bpage->list, cpu_buffer->pages);
}
rb_reset_cpu(cpu_buffer);
+ spin_unlock_irq(&cpu_buffer->reader_lock);
rb_check_pages(cpu_buffer);
@@ -627,16 +1288,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
return size;
mutex_lock(&buffer->mutex);
+ get_online_cpus();
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
if (size < buffer_size) {
/* easy case, just free pages */
- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
- mutex_unlock(&buffer->mutex);
- return -1;
- }
+ if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
+ goto out_fail;
rm_pages = buffer->pages - nr_pages;
@@ -655,10 +1315,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
* add these pages to the cpu_buffers. Otherwise we just free
* them all and return -ENOMEM;
*/
- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
- mutex_unlock(&buffer->mutex);
- return -1;
- }
+ if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
+ goto out_fail;
new_pages = nr_pages - buffer->pages;
@@ -683,13 +1341,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
rb_insert_pages(cpu_buffer, &pages, new_pages);
}
- if (RB_WARN_ON(buffer, !list_empty(&pages))) {
- mutex_unlock(&buffer->mutex);
- return -1;
- }
+ if (RB_WARN_ON(buffer, !list_empty(&pages)))
+ goto out_fail;
out:
buffer->pages = nr_pages;
+ put_online_cpus();
mutex_unlock(&buffer->mutex);
return size;
@@ -699,15 +1356,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
+ put_online_cpus();
mutex_unlock(&buffer->mutex);
return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_resize);
-static inline int rb_null_event(struct ring_buffer_event *event)
-{
- return event->type == RINGBUF_TYPE_PADDING;
+ /*
+ * Something went totally wrong, and we are too paranoid
+ * to even clean up the mess.
+ */
+ out_fail:
+ put_online_cpus();
+ mutex_unlock(&buffer->mutex);
+ return -1;
}
+EXPORT_SYMBOL_GPL(ring_buffer_resize);
static inline void *
__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
@@ -728,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
}
static inline struct ring_buffer_event *
-rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return __rb_page_index(cpu_buffer->head_page,
- cpu_buffer->head_page->read);
-}
-
-static inline struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
return __rb_page_index(iter->head_page, iter->head);
}
-static inline unsigned rb_page_write(struct buffer_page *bpage)
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
{
- return local_read(&bpage->write);
+ return local_read(&bpage->write) & RB_WRITE_MASK;
}
static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -750,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
return local_read(&bpage->page->commit);
}
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
/* Size is determined by what has been commited */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
@@ -762,58 +1422,17 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
return rb_page_commit(cpu_buffer->commit_page);
}
-static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return rb_page_commit(cpu_buffer->head_page);
-}
-
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
- struct ring_buffer_event *event;
- unsigned long head;
-
- for (head = 0; head < rb_head_size(cpu_buffer);
- head += rb_event_length(event)) {
-
- event = __rb_page_index(cpu_buffer->head_page, head);
- if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
- return;
- /* Only count data entries */
- if (event->type != RINGBUF_TYPE_DATA)
- continue;
- cpu_buffer->overrun++;
- cpu_buffer->entries--;
- }
-}
-
-static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page **bpage)
-{
- struct list_head *p = (*bpage)->list.next;
-
- if (p == &cpu_buffer->pages)
- p = p->next;
-
- *bpage = list_entry(p, struct buffer_page, list);
-}
-
static inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
+ return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
static inline int
-rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
unsigned long index;
@@ -825,34 +1444,11 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_commit_index(cpu_buffer) == index;
}
-static inline void
-rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- while (cpu_buffer->commit_page->page != (void *)addr) {
- if (RB_WARN_ON(cpu_buffer,
- cpu_buffer->commit_page == cpu_buffer->tail_page))
- return;
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- }
-
- /* Now set the commit to the event's index */
- local_set(&cpu_buffer->commit_page->page->commit, index);
-}
-
-static inline void
+static void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
+ unsigned long max_count;
+
/*
* We only race with interrupts and NMIs on this CPU.
* If we own the commit event, then we can commit
@@ -862,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* assign the commit to the tail.
*/
again:
+ max_count = cpu_buffer->buffer->pages * 100;
+
while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
@@ -873,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
}
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
- cpu_buffer->commit_page->page->commit =
- cpu_buffer->commit_page->write;
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
barrier();
}
@@ -896,7 +1503,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read = 0;
}
-static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+static void rb_inc_iter(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
@@ -907,7 +1514,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
* to the head page instead of next.
*/
if (iter->head_page == cpu_buffer->reader_page)
- iter->head_page = cpu_buffer->head_page;
+ iter->head_page = rb_set_head_page(cpu_buffer);
else
rb_inc_page(cpu_buffer, &iter->head_page);
@@ -926,45 +1533,189 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
* and with this, we can determine what to place into the
* data field.
*/
-static inline void
+static void
rb_update_event(struct ring_buffer_event *event,
unsigned type, unsigned length)
{
- event->type = type;
+ event->type_len = type;
switch (type) {
case RINGBUF_TYPE_PADDING:
- break;
-
case RINGBUF_TYPE_TIME_EXTEND:
- event->len =
- (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
- break;
-
case RINGBUF_TYPE_TIME_STAMP:
- event->len =
- (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
break;
- case RINGBUF_TYPE_DATA:
+ case 0:
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
- event->len = 0;
+ if (length > RB_MAX_SMALL_DATA)
event->array[0] = length;
- } else
- event->len =
- (length + (RB_ALIGNMENT-1))
- >> RB_ALIGNMENT_SHIFT;
+ else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
break;
default:
BUG();
}
}
-static inline unsigned rb_calculate_event_length(unsigned length)
+/*
+ * rb_handle_head_page - writer hit the head page
+ *
+ * Returns: +1 to retry page
+ * 0 to continue
+ * -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *new_head;
+ int entries;
+ int type;
+ int ret;
+
+ entries = rb_page_entries(next_page);
+
+ /*
+ * The hard part is here. We need to move the head
+ * forward, and protect against both readers on
+ * other CPUs and writers coming in via interrupts.
+ */
+ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+ RB_PAGE_HEAD);
+
+ /*
+ * type can be one of four:
+ * NORMAL - an interrupt already moved it for us
+ * HEAD - we are the first to get here.
+ * UPDATE - we are the interrupt interrupting
+ * a current move.
+ * MOVED - a reader on another CPU moved the next
+ * pointer to its reader page. Give up
+ * and try again.
+ */
+
+ switch (type) {
+ case RB_PAGE_HEAD:
+ /*
+ * We changed the head to UPDATE, thus
+ * it is our responsibility to update
+ * the counters.
+ */
+ local_add(entries, &cpu_buffer->overrun);
+
+ /*
+ * The entries will be zeroed out when we move the
+ * tail page.
+ */
+
+ /* still more to do */
+ break;
+
+ case RB_PAGE_UPDATE:
+ /*
+ * This is an interrupt that interrupt the
+ * previous update. Still more to do.
+ */
+ break;
+ case RB_PAGE_NORMAL:
+ /*
+ * An interrupt came in before the update
+ * and processed this for us.
+ * Nothing left to do.
+ */
+ return 1;
+ case RB_PAGE_MOVED:
+ /*
+ * The reader is on another CPU and just did
+ * a swap with our next_page.
+ * Try again.
+ */
+ return 1;
+ default:
+ RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+ return -1;
+ }
+
+ /*
+ * Now that we are here, the old head pointer is
+ * set to UPDATE. This will keep the reader from
+ * swapping the head page with the reader page.
+ * The reader (on another CPU) will spin till
+ * we are finished.
+ *
+ * We just need to protect against interrupts
+ * doing the job. We will set the next pointer
+ * to HEAD. After that, we set the old pointer
+ * to NORMAL, but only if it was HEAD before.
+ * otherwise we are an interrupt, and only
+ * want the outer most commit to reset it.
+ */
+ new_head = next_page;
+ rb_inc_page(cpu_buffer, &new_head);
+
+ ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+ RB_PAGE_NORMAL);
+
+ /*
+ * Valid returns are:
+ * HEAD - an interrupt came in and already set it.
+ * NORMAL - One of two things:
+ * 1) We really set it.
+ * 2) A bunch of interrupts came in and moved
+ * the page forward again.
+ */
+ switch (ret) {
+ case RB_PAGE_HEAD:
+ case RB_PAGE_NORMAL:
+ /* OK */
+ break;
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ return -1;
+ }
+
+ /*
+ * It is possible that an interrupt came in,
+ * set the head up, then more interrupts came in
+ * and moved it again. When we get back here,
+ * the page would have been set to NORMAL but we
+ * just set it back to HEAD.
+ *
+ * How do you detect this? Well, if that happened
+ * the tail page would have moved.
+ */
+ if (ret == RB_PAGE_NORMAL) {
+ /*
+ * If the tail had moved passed next, then we need
+ * to reset the pointer.
+ */
+ if (cpu_buffer->tail_page != tail_page &&
+ cpu_buffer->tail_page != next_page)
+ rb_head_page_set_normal(cpu_buffer, new_head,
+ next_page,
+ RB_PAGE_HEAD);
+ }
+
+ /*
+ * If this was the outer most commit (the one that
+ * changed the original pointer from HEAD to UPDATE),
+ * then it is up to us to reset it to NORMAL.
+ */
+ if (type == RB_PAGE_HEAD) {
+ ret = rb_head_page_set_normal(cpu_buffer, next_page,
+ tail_page,
+ RB_PAGE_UPDATE);
+ if (RB_WARN_ON(cpu_buffer,
+ ret != RB_PAGE_UPDATE))
+ return -1;
+ }
+
+ return 0;
+}
+
+static unsigned rb_calculate_event_length(unsigned length)
{
struct ring_buffer_event event; /* Used only for sizeof array */
@@ -981,133 +1732,239 @@ static inline unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length, u64 *ts)
+static inline void
+rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ unsigned long tail, unsigned long length)
{
- struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
- unsigned long tail, write;
- struct ring_buffer *buffer = cpu_buffer->buffer;
struct ring_buffer_event *event;
- unsigned long flags;
- commit_page = cpu_buffer->commit_page;
- /* we just need to protect against interrupts */
- barrier();
- tail_page = cpu_buffer->tail_page;
- write = local_add_return(length, &tail_page->write);
- tail = write - length;
+ /*
+ * Only the event that crossed the page boundary
+ * must fill the old tail_page with padding.
+ */
+ if (tail >= BUF_PAGE_SIZE) {
+ local_sub(length, &tail_page->write);
+ return;
+ }
- /* See if we shot pass the end of this buffer page */
- if (write > BUF_PAGE_SIZE) {
- struct buffer_page *next_page = tail_page;
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
- local_irq_save(flags);
- __raw_spin_lock(&cpu_buffer->lock);
+ /*
+ * If this event is bigger than the minimum size, then
+ * we need to be careful that we don't subtract the
+ * write counter enough to allow another writer to slip
+ * in on this page.
+ * We put in a discarded commit instead, to make sure
+ * that this space is not used again.
+ *
+ * If we are less than the minimum size, we don't need to
+ * worry about it.
+ */
+ if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ /* No room for any events */
- rb_inc_page(cpu_buffer, &next_page);
+ /* Mark the rest of the page with padding */
+ rb_event_set_padding(event);
- head_page = cpu_buffer->head_page;
- reader_page = cpu_buffer->reader_page;
+ /* Set the write back to the previous setting */
+ local_sub(length, &tail_page->write);
+ return;
+ }
- /* we grabbed the lock before incrementing */
- if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
- goto out_unlock;
+ /* Put in a discarded event */
+ event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ event->time_delta = 1;
- /*
- * If for some reason, we had an interrupt storm that made
- * it all the way around the buffer, bail, and warn
- * about it.
- */
- if (unlikely(next_page == commit_page)) {
- WARN_ON_ONCE(1);
- goto out_unlock;
- }
+ /* Set write to end of buffer */
+ length = (tail + length) - BUF_PAGE_SIZE;
+ local_sub(length, &tail_page->write);
+}
- if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE)) {
- /* reset write */
- if (tail <= BUF_PAGE_SIZE)
- local_set(&tail_page->write, tail);
- goto out_unlock;
- }
+static struct ring_buffer_event *
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length, unsigned long tail,
+ struct buffer_page *commit_page,
+ struct buffer_page *tail_page, u64 *ts)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct buffer_page *next_page;
+ int ret;
- /* tail_page has not moved yet? */
- if (tail_page == cpu_buffer->tail_page) {
- /* count overflows */
- rb_update_overflow(cpu_buffer);
+ next_page = tail_page;
- rb_inc_page(cpu_buffer, &head_page);
- cpu_buffer->head_page = head_page;
- cpu_buffer->head_page->read = 0;
- }
- }
+ rb_inc_page(cpu_buffer, &next_page);
+
+ /*
+ * If for some reason, we had an interrupt storm that made
+ * it all the way around the buffer, bail, and warn
+ * about it.
+ */
+ if (unlikely(next_page == commit_page)) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
+
+ /*
+ * This is where the fun begins!
+ *
+ * We are fighting against races between a reader that
+ * could be on another CPU trying to swap its reader
+ * page with the buffer head.
+ *
+ * We are also fighting against interrupts coming in and
+ * moving the head or tail on us as well.
+ *
+ * If the next page is the head page then we have filled
+ * the buffer, unless the commit page is still on the
+ * reader page.
+ */
+ if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
/*
- * If the tail page is still the same as what we think
- * it is, then it is up to us to update the tail
- * pointer.
+ * If the commit is not on the reader page, then
+ * move the header page.
*/
- if (tail_page == cpu_buffer->tail_page) {
- local_set(&next_page->write, 0);
- local_set(&next_page->page->commit, 0);
- cpu_buffer->tail_page = next_page;
-
- /* reread the time stamp */
- *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
- cpu_buffer->tail_page->page->time_stamp = *ts;
+ if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+ /*
+ * If we are not in overwrite mode,
+ * this is easy, just stop here.
+ */
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ goto out_reset;
+
+ ret = rb_handle_head_page(cpu_buffer,
+ tail_page,
+ next_page);
+ if (ret < 0)
+ goto out_reset;
+ if (ret)
+ goto out_again;
+ } else {
+ /*
+ * We need to be careful here too. The
+ * commit page could still be on the reader
+ * page. We could have a small buffer, and
+ * have filled up the buffer with events
+ * from interrupts and such, and wrapped.
+ *
+ * Note, if the tail page is also the on the
+ * reader_page, we let it move out.
+ */
+ if (unlikely((cpu_buffer->commit_page !=
+ cpu_buffer->tail_page) &&
+ (cpu_buffer->commit_page ==
+ cpu_buffer->reader_page))) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
}
+ }
+ ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+ if (ret) {
/*
- * The actual tail page has moved forward.
+ * Nested commits always have zero deltas, so
+ * just reread the time stamp
*/
- if (tail < BUF_PAGE_SIZE) {
- /* Mark the rest of the page with padding */
- event = __rb_page_index(tail_page, tail);
- event->type = RINGBUF_TYPE_PADDING;
- }
+ *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+ next_page->page->time_stamp = *ts;
+ }
- if (tail <= BUF_PAGE_SIZE)
- /* Set the write back to the previous setting */
- local_set(&tail_page->write, tail);
+ out_again:
- /*
- * If this was a commit entry that failed,
- * increment that too
- */
- if (tail_page == cpu_buffer->commit_page &&
- tail == rb_commit_index(cpu_buffer)) {
- rb_set_commit_to_write(cpu_buffer);
- }
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
+ /* fail and let the caller try again */
+ return ERR_PTR(-EAGAIN);
- /* fail and let the caller try again */
- return ERR_PTR(-EAGAIN);
- }
+ out_reset:
+ /* reset write */
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
- /* We reserved something on the buffer */
+ return NULL;
+}
- if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
- return NULL;
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *tail_page, *commit_page;
+ struct ring_buffer_event *event;
+ unsigned long tail, write;
+
+ commit_page = cpu_buffer->commit_page;
+ /* we just need to protect against interrupts */
+ barrier();
+ tail_page = cpu_buffer->tail_page;
+ write = local_add_return(length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
+ tail = write - length;
+
+ /* See if we shot pass the end of this buffer page */
+ if (write > BUF_PAGE_SIZE)
+ return rb_move_tail(cpu_buffer, length, tail,
+ commit_page, tail_page, ts);
+
+ /* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(event, type, length);
+ /* The passed in type is zero for DATA */
+ if (likely(!type))
+ local_inc(&tail_page->entries);
+
/*
- * If this is a commit and the tail is zero, then update
- * this page's time stamp.
+ * If this is the first commit on the page, then update
+ * its timestamp.
*/
- if (!tail && rb_is_commit(cpu_buffer, event))
- cpu_buffer->commit_page->page->time_stamp = *ts;
+ if (!tail)
+ tail_page->page->time_stamp = *ts;
return event;
+}
- out_unlock:
- __raw_spin_unlock(&cpu_buffer->lock);
- local_irq_restore(flags);
- return NULL;
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long new_index, old_index;
+ struct buffer_page *bpage;
+ unsigned long index;
+ unsigned long addr;
+
+ new_index = rb_event_index(event);
+ old_index = new_index + rb_event_length(event);
+ addr = (unsigned long)event;
+ addr &= PAGE_MASK;
+
+ bpage = cpu_buffer->tail_page;
+
+ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ unsigned long write_mask =
+ local_read(&bpage->write) & ~RB_WRITE_MASK;
+ /*
+ * This is on the tail page. It is possible that
+ * a write could come in and move the tail page
+ * and write to the next page. That is fine
+ * because we just shorten what is on this page.
+ */
+ old_index += write_mask;
+ new_index += write_mask;
+ index = local_cmpxchg(&bpage->write, old_index, new_index);
+ if (index == old_index)
+ return 1;
+ }
+
+ /* could not discard */
+ return 0;
}
static int
@@ -1142,26 +1999,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return -EAGAIN;
/* Only a commited time event can update the write stamp */
- if (rb_is_commit(cpu_buffer, event)) {
+ if (rb_event_is_commit(cpu_buffer, event)) {
/*
- * If this is the first on the page, then we need to
- * update the page itself, and just put in a zero.
+ * If this is the first on the page, then it was
+ * updated with the page itself. Try to discard it
+ * and if we can't just make it zero.
*/
if (rb_event_index(event)) {
event->time_delta = *delta & TS_MASK;
event->array[0] = *delta >> TS_SHIFT;
} else {
- cpu_buffer->commit_page->page->time_stamp = *ts;
- event->time_delta = 0;
- event->array[0] = 0;
+ /* try to discard, since we do not need this */
+ if (!rb_try_to_discard(cpu_buffer, event)) {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
}
cpu_buffer->write_stamp = *ts;
/* let the caller know this was the commit */
ret = 1;
} else {
- /* Darn, this is just wasted space */
- event->time_delta = 0;
- event->array[0] = 0;
+ /* Try to discard the event */
+ if (!rb_try_to_discard(cpu_buffer, event)) {
+ /* Darn, this is just wasted space */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
ret = 0;
}
@@ -1170,15 +2034,72 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
return ret;
}
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_inc(&cpu_buffer->committing);
+ local_inc(&cpu_buffer->commits);
+}
+
+static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long commits;
+
+ if (RB_WARN_ON(cpu_buffer,
+ !local_read(&cpu_buffer->committing)))
+ return;
+
+ again:
+ commits = local_read(&cpu_buffer->commits);
+ /* synchronize with interrupts */
+ barrier();
+ if (local_read(&cpu_buffer->committing) == 1)
+ rb_set_commit_to_write(cpu_buffer);
+
+ local_dec(&cpu_buffer->committing);
+
+ /* synchronize with interrupts */
+ barrier();
+
+ /*
+ * Need to account for interrupts coming in between the
+ * updating of the commit page and the clearing of the
+ * committing counter.
+ */
+ if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+ !local_read(&cpu_buffer->committing)) {
+ local_inc(&cpu_buffer->committing);
+ goto again;
+ }
+}
+
static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length)
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
{
struct ring_buffer_event *event;
- u64 ts, delta;
+ u64 ts, delta = 0;
int commit = 0;
int nr_loops = 0;
+ rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
+ length = rb_calculate_event_length(length);
again:
/*
* We allow for interrupts to reenter here and do a trace.
@@ -1190,9 +2111,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
* Bail!
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
- return NULL;
+ goto out_fail;
- ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+ ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
/*
* Only the first commit can update the timestamp.
@@ -1202,70 +2123,99 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
* also be made. But only the entry that did the actual
* commit will be something other than zero.
*/
- if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
- rb_page_write(cpu_buffer->tail_page) ==
- rb_commit_index(cpu_buffer)) {
+ if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+ rb_page_write(cpu_buffer->tail_page) ==
+ rb_commit_index(cpu_buffer))) {
+ u64 diff;
- delta = ts - cpu_buffer->write_stamp;
+ diff = ts - cpu_buffer->write_stamp;
- /* make sure this delta is calculated here */
+ /* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
- delta = 0;
+ goto get_event;
- if (test_time_stamp(delta)) {
+ delta = diff;
+ if (unlikely(test_time_stamp(delta))) {
commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
if (commit == -EBUSY)
- return NULL;
+ goto out_fail;
if (commit == -EAGAIN)
goto again;
RB_WARN_ON(cpu_buffer, commit < 0);
}
- } else
- /* Non commits have zero deltas */
- delta = 0;
+ }
- event = __rb_reserve_next(cpu_buffer, type, length, &ts);
- if (PTR_ERR(event) == -EAGAIN)
+ get_event:
+ event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
- if (!event) {
- if (unlikely(commit))
- /*
- * Ouch! We needed a timestamp and it was commited. But
- * we didn't get our event reserved.
- */
- rb_set_commit_to_write(cpu_buffer);
- return NULL;
- }
+ if (!event)
+ goto out_fail;
- /*
- * If the timestamp was commited, make the commit our entry
- * now so that we will update it when needed.
- */
- if (commit)
- rb_set_commit_event(cpu_buffer, event);
- else if (!rb_is_commit(cpu_buffer, event))
+ if (!rb_event_is_commit(cpu_buffer, event))
delta = 0;
event->time_delta = delta;
return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+#ifdef CONFIG_TRACING
+
+#define TRACE_RECURSIVE_DEPTH 16
+
+static int trace_recursive_lock(void)
+{
+ current->trace_recursion++;
+
+ if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ return 0;
+
+ /* Disable all tracing before we do anything else */
+ tracing_off_permanent();
+
+ printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+ "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+ current->trace_recursion,
+ hardirq_count() >> HARDIRQ_SHIFT,
+ softirq_count() >> SOFTIRQ_SHIFT,
+ in_nmi());
+
+ WARN_ON_ONCE(1);
+ return -1;
+}
+
+static void trace_recursive_unlock(void)
+{
+ WARN_ON_ONCE(!current->trace_recursion);
+
+ current->trace_recursion--;
}
+#else
+
+#define trace_recursive_lock() (0)
+#define trace_recursive_unlock() do { } while (0)
+
+#endif
+
static DEFINE_PER_CPU(int, rb_need_resched);
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
- * @flags: a pointer to save the interrupt flags
*
* Returns a reseverd event on the ring buffer to copy directly to.
* The user of this interface will need to get the body to write into
@@ -1278,9 +2228,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
* If NULL is returned, then nothing has been allocated or locked.
*/
struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer,
- unsigned long length,
- unsigned long *flags)
+ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
@@ -1295,6 +2243,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
/* If we are tracing schedule, we don't want to recurse */
resched = ftrace_preempt_disable();
+ if (trace_recursive_lock())
+ goto out_nocheck;
+
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1305,11 +2256,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- length = rb_calculate_event_length(length);
- if (length > BUF_PAGE_SIZE)
+ if (length > BUF_MAX_DATA_SIZE)
goto out;
- event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
goto out;
@@ -1324,38 +2274,45 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
return event;
out:
+ trace_recursive_unlock();
+
+ out_nocheck:
ftrace_preempt_enable(resched);
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
- cpu_buffer->entries++;
-
- /* Only process further if we own the commit */
- if (!rb_is_commit(cpu_buffer, event))
- return;
-
- cpu_buffer->write_stamp += event->time_delta;
+ /*
+ * The event first in the commit queue updates the
+ * time stamp.
+ */
+ if (rb_event_is_commit(cpu_buffer, event))
+ cpu_buffer->write_stamp += event->time_delta;
+}
- rb_set_commit_to_write(cpu_buffer);
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
+ rb_end_commit(cpu_buffer);
}
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
* @event: The event pointer to commit.
- * @flags: the interrupt flags received from ring_buffer_lock_reserve.
*
* This commits the data to the ring buffer, and releases any locks held.
*
* Must be paired with ring_buffer_lock_reserve.
*/
int ring_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags)
+ struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu = raw_smp_processor_id();
@@ -1364,6 +2321,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ trace_recursive_unlock();
+
/*
* Only the last preempt count needs to restore preemption.
*/
@@ -1376,6 +2335,119 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
+
+/*
+ * Decrement the entries to the page that an event is on.
+ * The event does not even need to exist, only the pointer
+ * to the page it is on. This may only be called before the commit
+ * takes place.
+ */
+static inline void
+rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ struct buffer_page *bpage = cpu_buffer->commit_page;
+ struct buffer_page *start;
+
+ addr &= PAGE_MASK;
+
+ /* Do the likely case first */
+ if (likely(bpage->page == (void *)addr)) {
+ local_dec(&bpage->entries);
+ return;
+ }
+
+ /*
+ * Because the commit page may be on the reader page we
+ * start with the next page and check the end loop there.
+ */
+ rb_inc_page(cpu_buffer, &bpage);
+ start = bpage;
+ do {
+ if (bpage->page == (void *)addr) {
+ local_dec(&bpage->entries);
+ return;
+ }
+ rb_inc_page(cpu_buffer, &bpage);
+ } while (bpage != start);
+
+ /* commit not part of this buffer?? */
+ RB_WARN_ON(cpu_buffer, 1);
+}
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * Sometimes an event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * This function only works if it is called before the the item has been
+ * committed. It will try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* The event is discarded regardless */
+ rb_event_discard(event);
+
+ cpu = smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+
+ /*
+ * This must only be called if the event has not been
+ * committed yet. Thus we can assume that preemption
+ * is still disabled.
+ */
+ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
+
+ rb_decrement_entry(cpu_buffer, event);
+ if (rb_try_to_discard(cpu_buffer, event))
+ goto out;
+
+ /*
+ * The commit is still visible by the reader, so we
+ * must still update the timestamp.
+ */
+ rb_update_write_stamp(cpu_buffer, event);
+ out:
+ rb_end_commit(cpu_buffer);
+
+ trace_recursive_unlock();
+
+ /*
+ * Only the last preempt count needs to restore preemption.
+ */
+ if (preempt_count() == 1)
+ ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+ else
+ preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
/**
* ring_buffer_write - write data to the buffer without reserving
* @buffer: The ring buffer to write to.
@@ -1395,7 +2467,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
- unsigned long event_length;
void *body;
int ret = -EBUSY;
int cpu, resched;
@@ -1418,9 +2489,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- event_length = rb_calculate_event_length(length);
- event = rb_reserve_next_event(cpu_buffer,
- RINGBUF_TYPE_DATA, event_length);
+ if (length > BUF_MAX_DATA_SIZE)
+ goto out;
+
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
goto out;
@@ -1438,12 +2510,16 @@ int ring_buffer_write(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = cpu_buffer->head_page;
+ struct buffer_page *head = rb_set_head_page(cpu_buffer);
struct buffer_page *commit = cpu_buffer->commit_page;
+ /* In case of error, head will be NULL */
+ if (unlikely(!head))
+ return 1;
+
return reader->read == rb_page_commit(reader) &&
(commit == reader ||
(commit == head &&
@@ -1528,12 +2604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
- return cpu_buffer->entries;
+ ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
+ - cpu_buffer->read;
+
+ return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
@@ -1545,16 +2625,40 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
- return cpu_buffer->overrun;
+ ret = local_read(&cpu_buffer->overrun);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->commit_overrun);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
+/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
@@ -1570,7 +2674,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- entries += cpu_buffer->entries;
+ entries += (local_read(&cpu_buffer->entries) -
+ local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
}
return entries;
@@ -1593,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- overruns += cpu_buffer->overrun;
+ overruns += local_read(&cpu_buffer->overrun);
}
return overruns;
@@ -1606,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
/* Iterator usage is expected to have record disabled */
if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = cpu_buffer->head_page;
- iter->head = cpu_buffer->head_page->read;
+ iter->head_page = rb_set_head_page(cpu_buffer);
+ if (unlikely(!iter->head_page))
+ return;
+ iter->head = iter->head_page->read;
} else {
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -1627,9 +2734,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
*/
void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
{
- struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
+
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_iter_reset(iter);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1657,7 +2769,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
{
u64 delta;
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
return;
@@ -1688,7 +2800,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
{
u64 delta;
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
return;
@@ -1719,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
struct buffer_page *reader = NULL;
unsigned long flags;
int nr_loops = 0;
+ int ret;
local_irq_save(flags);
__raw_spin_lock(&cpu_buffer->lock);
@@ -1752,29 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
goto out;
/*
- * Splice the empty reader page into the list around the head.
* Reset the reader page to size zero.
*/
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
- reader = cpu_buffer->head_page;
+ spin:
+ /*
+ * Splice the empty reader page into the list around the head.
+ */
+ reader = rb_set_head_page(cpu_buffer);
cpu_buffer->reader_page->list.next = reader->list.next;
cpu_buffer->reader_page->list.prev = reader->list.prev;
- local_set(&cpu_buffer->reader_page->write, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
+ /*
+ * cpu_buffer->pages just needs to point to the buffer, it
+ * has no specific buffer page to point to. Lets move it out
+ * of our way so we don't accidently swap it.
+ */
+ cpu_buffer->pages = reader->list.prev;
- /* Make the reader page now replace the head */
- reader->list.prev->next = &cpu_buffer->reader_page->list;
- reader->list.next->prev = &cpu_buffer->reader_page->list;
+ /* The reader page will be pointing to the new head */
+ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+
+ /*
+ * Here's the tricky part.
+ *
+ * We need to move the pointer past the header page.
+ * But we can only do that if a writer is not currently
+ * moving it. The page before the header page has the
+ * flag bit '1' set if it is pointing to the page we want.
+ * but if the writer is in the process of moving it
+ * than it will be '2' or already moved '0'.
+ */
+
+ ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
/*
- * If the tail is on the reader, then we must set the head
- * to the inserted page, otherwise we set it one before.
+ * If we did not convert it, then we must try again.
*/
- cpu_buffer->head_page = cpu_buffer->reader_page;
+ if (!ret)
+ goto spin;
- if (cpu_buffer->commit_page != reader)
- rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ /*
+ * Yeah! We succeeded in replacing the page.
+ *
+ * Now make the new head point back to the reader page.
+ */
+ reader->list.next->prev = &cpu_buffer->reader_page->list;
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
/* Finally update the reader page to the new head */
cpu_buffer->reader_page = reader;
@@ -1803,8 +2943,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
event = rb_reader_event(cpu_buffer);
- if (event->type == RINGBUF_TYPE_DATA)
- cpu_buffer->entries--;
+ if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+ cpu_buffer->read++;
rb_update_read_stamp(cpu_buffer, event);
@@ -1826,8 +2966,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
* Check if we are at the end of the buffer.
*/
if (iter->head >= rb_page_size(iter->head_page)) {
- if (RB_WARN_ON(buffer,
- iter->head_page == cpu_buffer->commit_page))
+ /* discarded commits can make the page empty */
+ if (iter->head_page == cpu_buffer->commit_page)
return;
rb_inc_iter(iter);
return;
@@ -1864,21 +3004,16 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
struct buffer_page *reader;
int nr_loops = 0;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return NULL;
-
cpu_buffer = buffer->buffers[cpu];
again:
/*
* We repeat when a timestamp is encountered. It is possible
* to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written. The max times
- * that this can happen is the number of nested interrupts we
- * can have. Nesting 10 deep of interrupts is clearly
- * an anomaly.
+ * as one timestamp is about to be written, or from discarded
+ * commits. The most that we can have is the number on a single page.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@@ -1887,11 +3022,19 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
event = rb_reader_event(cpu_buffer);
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
- RB_WARN_ON(cpu_buffer, 1);
- rb_advance_reader(cpu_buffer);
- return NULL;
+ if (rb_null_event(event))
+ RB_WARN_ON(cpu_buffer, 1);
+ /*
+ * Because the writer could be discarding every
+ * event it creates (which would probably be bad)
+ * if we were to go back to "again" then we may never
+ * catch up, and will trigger the warn on, or lock
+ * the box. Return the padding, and we will release
+ * the current locks, and try again.
+ */
+ return event;
case RINGBUF_TYPE_TIME_EXTEND:
/* Internal data, OK to advance */
@@ -1906,7 +3049,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
case RINGBUF_TYPE_DATA:
if (ts) {
*ts = cpu_buffer->read_stamp + event->time_delta;
- ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ ring_buffer_normalize_time_stamp(buffer,
+ cpu_buffer->cpu, ts);
}
return event;
@@ -1934,14 +3078,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
again:
/*
- * We repeat when a timestamp is encountered. It is possible
- * to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written. The max times
- * that this can happen is the number of nested interrupts we
- * can have. Nesting 10 deep of interrupts is clearly
- * an anomaly.
+ * We repeat when a timestamp is encountered.
+ * We can get multiple timestamps by nested interrupts or also
+ * if filtering is on (discarding commits). Since discarding
+ * commits can be frequent we can get a lot of timestamps.
+ * But we limit them by not adding timestamps if they begin
+ * at the start of a page.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -1949,10 +3093,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
event = rb_iter_head_event(iter);
- switch (event->type) {
+ switch (event->type_len) {
case RINGBUF_TYPE_PADDING:
- rb_inc_iter(iter);
- goto again;
+ if (rb_null_event(event)) {
+ rb_inc_iter(iter);
+ goto again;
+ }
+ rb_advance_iter(iter);
+ return event;
case RINGBUF_TYPE_TIME_EXTEND:
/* Internal data, OK to advance */
@@ -1967,7 +3115,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_DATA:
if (ts) {
*ts = iter->read_stamp + event->time_delta;
- ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ ring_buffer_normalize_time_stamp(buffer,
+ cpu_buffer->cpu, ts);
}
return event;
@@ -1979,6 +3128,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
+static inline int rb_ok_to_lock(void)
+{
+ /*
+ * If an NMI die dumps out the content of the ring buffer
+ * do not grab locks. We also permanently disable the ring
+ * buffer too. A one time deal is all you get from reading
+ * the ring buffer from an NMI.
+ */
+ if (likely(!in_nmi()))
+ return 1;
+
+ tracing_off_permanent();
+ return 0;
+}
+
/**
* ring_buffer_peek - peek at the next event to be read
* @buffer: The ring buffer to read
@@ -1994,10 +3158,25 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
unsigned long flags;
+ int dolock;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return NULL;
+
+ dolock = rb_ok_to_lock();
+ again:
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
return event;
}
@@ -2017,10 +3196,14 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
unsigned long flags;
+ again:
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
event = rb_iter_peek(iter, ts);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
return event;
}
@@ -2035,23 +3218,38 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *
ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- struct ring_buffer_event *event;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event = NULL;
unsigned long flags;
+ int dolock;
+
+ dolock = rb_ok_to_lock();
+
+ again:
+ /* might be called in atomic */
+ preempt_disable();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return NULL;
+ goto out;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(buffer, cpu, ts);
- if (!event)
- goto out;
+ if (event)
+ rb_advance_reader(cpu_buffer);
- rb_advance_reader(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
out:
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ preempt_enable();
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
return event;
}
@@ -2132,10 +3330,14 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
unsigned long flags;
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ again:
event = rb_iter_peek(iter, ts);
if (!event)
goto out;
+ if (event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
rb_advance_iter(iter);
out:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -2157,9 +3359,12 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
+ rb_head_page_deactivate(cpu_buffer);
+
cpu_buffer->head_page
- = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
local_set(&cpu_buffer->head_page->write, 0);
+ local_set(&cpu_buffer->head_page->entries, 0);
local_set(&cpu_buffer->head_page->page->commit, 0);
cpu_buffer->head_page->read = 0;
@@ -2169,11 +3374,21 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->read = 0;
- cpu_buffer->overrun = 0;
- cpu_buffer->entries = 0;
+ local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->overrun, 0);
+ local_set(&cpu_buffer->entries, 0);
+ local_set(&cpu_buffer->committing, 0);
+ local_set(&cpu_buffer->commits, 0);
+ cpu_buffer->read = 0;
+
+ cpu_buffer->write_stamp = 0;
+ cpu_buffer->read_stamp = 0;
+
+ rb_head_page_activate(cpu_buffer);
}
/**
@@ -2189,15 +3404,23 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ atomic_inc(&cpu_buffer->record_disabled);
+
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
__raw_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
__raw_spin_unlock(&cpu_buffer->lock);
+ out:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ atomic_dec(&cpu_buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -2221,14 +3444,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
int ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
int cpu;
+ int ret;
+
+ dolock = rb_ok_to_lock();
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- if (!rb_per_cpu_empty(cpu_buffer))
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
+ ret = rb_per_cpu_empty(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ if (!ret)
return 0;
}
+
return 1;
}
EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2241,15 +3478,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
+ int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 1;
+ dolock = rb_ok_to_lock();
+
cpu_buffer = buffer->buffers[cpu];
- return rb_per_cpu_empty(cpu_buffer);
+ local_irq_save(flags);
+ if (dolock)
+ spin_lock(&cpu_buffer->reader_lock);
+ ret = rb_per_cpu_empty(cpu_buffer);
+ if (dolock)
+ spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
* @buffer_a: One buffer to swap with
@@ -2265,18 +3516,36 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
{
struct ring_buffer_per_cpu *cpu_buffer_a;
struct ring_buffer_per_cpu *cpu_buffer_b;
+ int ret = -EINVAL;
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- return -EINVAL;
+ goto out;
/* At least make sure the two buffers are somewhat the same */
if (buffer_a->pages != buffer_b->pages)
- return -EINVAL;
+ goto out;
+
+ ret = -EAGAIN;
+
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ goto out;
+
+ if (atomic_read(&buffer_a->record_disabled))
+ goto out;
+
+ if (atomic_read(&buffer_b->record_disabled))
+ goto out;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ if (atomic_read(&cpu_buffer_a->record_disabled))
+ goto out;
+
+ if (atomic_read(&cpu_buffer_b->record_disabled))
+ goto out;
+
/*
* We can't do a synchronize_sched here because this
* function can be called in atomic context.
@@ -2286,39 +3555,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
atomic_inc(&cpu_buffer_a->record_disabled);
atomic_inc(&cpu_buffer_b->record_disabled);
+ ret = -EBUSY;
+ if (local_read(&cpu_buffer_a->committing))
+ goto out_dec;
+ if (local_read(&cpu_buffer_b->committing))
+ goto out_dec;
+
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
cpu_buffer_b->buffer = buffer_a;
cpu_buffer_a->buffer = buffer_b;
+ ret = 0;
+
+out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-
- return 0;
+out:
+ return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
-
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_data_page *bpage)
-{
- struct ring_buffer_event *event;
- unsigned long head;
-
- __raw_spin_lock(&cpu_buffer->lock);
- for (head = 0; head < local_read(&bpage->commit);
- head += rb_event_length(event)) {
-
- event = __rb_data_page_index(bpage, head);
- if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
- return;
- /* Only count data entries */
- if (event->type != RINGBUF_TYPE_DATA)
- continue;
- cpu_buffer->entries--;
- }
- __raw_spin_unlock(&cpu_buffer->lock);
-}
+#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
/**
* ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -2337,8 +3595,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
{
- unsigned long addr;
struct buffer_data_page *bpage;
+ unsigned long addr;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
@@ -2346,8 +3604,11 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
bpage = (void *)addr;
+ rb_init_page(bpage);
+
return bpage;
}
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
/**
* ring_buffer_free_read_page - free an allocated read page
@@ -2360,11 +3621,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
{
free_page((unsigned long)data);
}
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
/**
* ring_buffer_read_page - extract a page from the ring buffer
* @buffer: buffer to extract from
* @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
* @cpu: the cpu of the buffer to extract
* @full: should the extraction only happen when the page is full.
*
@@ -2374,12 +3637,12 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
* to swap with a page in the ring buffer.
*
* for example:
- * rpage = ring_buffer_alloc_page(buffer);
+ * rpage = ring_buffer_alloc_read_page(buffer);
* if (!rpage)
* return error;
- * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
- * if (ret)
- * process_page(rpage);
+ * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * if (ret >= 0)
+ * process_page(rpage, ret);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -2390,80 +3653,129 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
* responsible for that.
*
* Returns:
- * 1 if data has been transferred
- * 0 if no data has been transferred.
+ * >=0 if data has been transferred, returns the offset of consumed data.
+ * <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct ring_buffer *buffer,
- void **data_page, int cpu, int full)
+ void **data_page, size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
struct buffer_data_page *bpage;
+ struct buffer_page *reader;
unsigned long flags;
- int ret = 0;
+ unsigned int commit;
+ unsigned int read;
+ u64 save_timestamp;
+ int ret = -1;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ /*
+ * If len is not big enough to hold the page header, then
+ * we can not copy anything.
+ */
+ if (len <= BUF_PAGE_HDR_SIZE)
+ goto out;
+
+ len -= BUF_PAGE_HDR_SIZE;
if (!data_page)
- return 0;
+ goto out;
bpage = *data_page;
if (!bpage)
- return 0;
+ goto out;
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- /*
- * rb_buffer_peek will get the next ring buffer if
- * the current reader page is empty.
- */
- event = rb_buffer_peek(buffer, cpu, NULL);
- if (!event)
- goto out;
+ reader = rb_get_reader_page(cpu_buffer);
+ if (!reader)
+ goto out_unlock;
+
+ event = rb_reader_event(cpu_buffer);
+
+ read = reader->read;
+ commit = rb_page_commit(reader);
- /* check for data */
- if (!local_read(&cpu_buffer->reader_page->page->commit))
- goto out;
/*
- * If the writer is already off of the read page, then simply
- * switch the read page with the given page. Otherwise
- * we need to copy the data from the reader to the writer.
+ * If this page has been partially read or
+ * if len is not big enough to read the rest of the page or
+ * a writer is still on the page, then
+ * we must copy the data from the page to the buffer.
+ * Otherwise, we can simply swap the page with the one passed in.
*/
- if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
- unsigned int read = cpu_buffer->reader_page->read;
+ if (read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
+ unsigned int rpos = read;
+ unsigned int pos = 0;
+ unsigned int size;
if (full)
- goto out;
- /* The writer is still on the reader page, we must copy */
- bpage = cpu_buffer->reader_page->page;
- memcpy(bpage->data,
- cpu_buffer->reader_page->page->data + read,
- local_read(&bpage->commit) - read);
+ goto out_unlock;
- /* consume what was read */
- cpu_buffer->reader_page += read;
+ if (len > (commit - read))
+ len = (commit - read);
+ size = rb_event_length(event);
+
+ if (len < size)
+ goto out_unlock;
+
+ /* save the current timestamp, since the user will need it */
+ save_timestamp = cpu_buffer->read_stamp;
+
+ /* Need to copy one event at a time */
+ do {
+ memcpy(bpage->data + pos, rpage->data + rpos, size);
+
+ len -= size;
+
+ rb_advance_reader(cpu_buffer);
+ rpos = reader->read;
+ pos += size;
+
+ event = rb_reader_event(cpu_buffer);
+ size = rb_event_length(event);
+ } while (len > size);
+
+ /* update bpage */
+ local_set(&bpage->commit, pos);
+ bpage->time_stamp = save_timestamp;
+
+ /* we copied everything to the beginning */
+ read = 0;
} else {
+ /* update the entry counter */
+ cpu_buffer->read += rb_page_entries(reader);
+
/* swap the pages */
rb_init_page(bpage);
- bpage = cpu_buffer->reader_page->page;
- cpu_buffer->reader_page->page = *data_page;
- cpu_buffer->reader_page->read = 0;
+ bpage = reader->page;
+ reader->page = *data_page;
+ local_set(&reader->write, 0);
+ local_set(&reader->entries, 0);
+ reader->read = 0;
*data_page = bpage;
}
- ret = 1;
+ ret = read;
- /* update the entry counter */
- rb_remove_entries(cpu_buffer, bpage);
- out:
+ out_unlock:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ out:
return ret;
}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+#ifdef CONFIG_TRACING
static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- long *p = filp->private_data;
+ unsigned long *p = filp->private_data;
char buf[64];
int r;
@@ -2479,9 +3791,9 @@ static ssize_t
rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- long *p = filp->private_data;
+ unsigned long *p = filp->private_data;
char buf[64];
- long val;
+ unsigned long val;
int ret;
if (cnt >= sizeof(buf))
@@ -2506,7 +3818,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static struct file_operations rb_simple_fops = {
+static const struct file_operations rb_simple_fops = {
.open = tracing_open_generic,
.read = rb_simple_read,
.write = rb_simple_write,
@@ -2516,16 +3828,53 @@ static struct file_operations rb_simple_fops = {
static __init int rb_init_debugfs(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
- entry = debugfs_create_file("tracing_on", 0644, d_tracer,
- &ring_buffer_flags, &rb_simple_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_on' entry\n");
+ trace_create_file("tracing_on", 0644, d_tracer,
+ &ring_buffer_flags, &rb_simple_fops);
return 0;
}
fs_initcall(rb_init_debugfs);
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int rb_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ struct ring_buffer *buffer =
+ container_of(self, struct ring_buffer, cpu_notify);
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (cpumask_test_cpu(cpu, buffer->cpumask))
+ return NOTIFY_OK;
+
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu]) {
+ WARN(1, "failed to allocate ring buffer on CPU %ld\n",
+ cpu);
+ return NOTIFY_OK;
+ }
+ smp_wmb();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /*
+ * Do nothing.
+ * If we were to free the buffer, then the user would
+ * lose any trace that was in the buffer.
+ */
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+#endif
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 00000000000..573d3cc762c
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,419 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+ u64 ts;
+ local_t commit;
+ char data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME 10
+#define SLEEP_TIME 10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST() \
+ do { \
+ if (!kill_test) { \
+ kill_test = 1; \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+enum event_status {
+ EVENT_FOUND,
+ EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+ struct ring_buffer_event *event;
+ int *entry;
+ u64 ts;
+
+ event = ring_buffer_consume(buffer, cpu, &ts);
+ if (!event)
+ return EVENT_DROPPED;
+
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ return EVENT_DROPPED;
+ }
+
+ read++;
+ return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+ struct ring_buffer_event *event;
+ struct rb_page *rpage;
+ unsigned long commit;
+ void *bpage;
+ int *entry;
+ int ret;
+ int inc;
+ int i;
+
+ bpage = ring_buffer_alloc_read_page(buffer);
+ if (!bpage)
+ return EVENT_DROPPED;
+
+ ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ if (ret >= 0) {
+ rpage = bpage;
+ commit = local_read(&rpage->commit);
+ for (i = 0; i < commit && !kill_test; i += inc) {
+
+ if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ KILL_TEST();
+ break;
+ }
+
+ inc = -1;
+ event = (void *)&rpage->data[i];
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ /* failed writes may be discarded events */
+ if (!event->time_delta)
+ KILL_TEST();
+ inc = event->array[0] + 4;
+ break;
+ case RINGBUF_TYPE_TIME_EXTEND:
+ inc = 8;
+ break;
+ case 0:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ if (!event->array[0]) {
+ KILL_TEST();
+ break;
+ }
+ inc = event->array[0] + 4;
+ break;
+ default:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ inc = ((event->type_len + 1) * 4);
+ }
+ if (kill_test)
+ break;
+
+ if (inc <= 0) {
+ KILL_TEST();
+ break;
+ }
+ }
+ }
+ ring_buffer_free_read_page(buffer, bpage);
+
+ if (ret < 0)
+ return EVENT_DROPPED;
+ return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+ /* toggle between reading pages and events */
+ read_events ^= 1;
+
+ read = 0;
+ while (!reader_finish && !kill_test) {
+ int found;
+
+ do {
+ int cpu;
+
+ found = 0;
+ for_each_online_cpu(cpu) {
+ enum event_status stat;
+
+ if (read_events)
+ stat = read_event(cpu);
+ else
+ stat = read_page(cpu);
+
+ if (kill_test)
+ break;
+ if (stat == EVENT_FOUND)
+ found = 1;
+ }
+ } while (found && !kill_test);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (reader_finish)
+ break;
+
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ reader_finish = 0;
+ complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+ struct timeval start_tv;
+ struct timeval end_tv;
+ unsigned long long time;
+ unsigned long long entries;
+ unsigned long long overruns;
+ unsigned long missed = 0;
+ unsigned long hit = 0;
+ unsigned long avg;
+ int cnt = 0;
+
+ /*
+ * Hammer the buffer for 10 secs (this may
+ * make the system stall)
+ */
+ trace_printk("Starting ring buffer hammer\n");
+ do_gettimeofday(&start_tv);
+ do {
+ struct ring_buffer_event *event;
+ int *entry;
+
+ event = ring_buffer_lock_reserve(buffer, 10);
+ if (!event) {
+ missed++;
+ } else {
+ hit++;
+ entry = ring_buffer_event_data(event);
+ *entry = smp_processor_id();
+ ring_buffer_unlock_commit(buffer, event);
+ }
+ do_gettimeofday(&end_tv);
+
+ cnt++;
+ if (consumer && !(cnt % wakeup_interval))
+ wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+ /*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call
+ * cond_resched and also add any time that was lost by a
+ * rescedule.
+ *
+ * Do a cond resched at the same frequency we would wake up
+ * the reader.
+ */
+ if (cnt % wakeup_interval)
+ cond_resched();
+#endif
+
+ } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+ trace_printk("End ring buffer hammer\n");
+
+ if (consumer) {
+ /* Init both completions here to avoid races */
+ init_completion(&read_start);
+ init_completion(&read_done);
+ /* the completions must be visible before the finish var */
+ smp_wmb();
+ reader_finish = 1;
+ /* finish var visible before waking up the consumer */
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_done);
+ }
+
+ time = end_tv.tv_sec - start_tv.tv_sec;
+ time *= USEC_PER_SEC;
+ time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+ entries = ring_buffer_entries(buffer);
+ overruns = ring_buffer_overruns(buffer);
+
+ if (kill_test)
+ trace_printk("ERROR!\n");
+ trace_printk("Time: %lld (usecs)\n", time);
+ trace_printk("Overruns: %lld\n", overruns);
+ if (disable_reader)
+ trace_printk("Read: (reader disabled)\n");
+ else
+ trace_printk("Read: %ld (by %s)\n", read,
+ read_events ? "events" : "pages");
+ trace_printk("Entries: %lld\n", entries);
+ trace_printk("Total: %lld\n", entries + overruns + read);
+ trace_printk("Missed: %ld\n", missed);
+ trace_printk("Hit: %ld\n", hit);
+
+ /* Convert time from usecs to millisecs */
+ do_div(time, USEC_PER_MSEC);
+ if (time)
+ hit /= (long)time;
+ else
+ trace_printk("TIME IS ZERO??\n");
+
+ trace_printk("Entries per millisec: %ld\n", hit);
+
+ if (hit) {
+ /* Calculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / hit;
+ trace_printk("%ld ns per entry\n", avg);
+ }
+
+ if (missed) {
+ if (time)
+ missed /= (long)time;
+
+ trace_printk("Total iterations per millisec: %ld\n",
+ hit + missed);
+
+ /* it is possible that hit + missed will overflow and be zero */
+ if (!(hit + missed)) {
+ trace_printk("hit + missed overflowed and totalled zero!\n");
+ hit--; /* make it non zero */
+ }
+
+ /* Caculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / (hit + missed);
+ trace_printk("%ld ns per entry\n", avg);
+ }
+}
+
+static void wait_to_die(void)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+ while (!kthread_should_stop() && !kill_test) {
+ complete(&read_start);
+
+ ring_buffer_consumer();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop() || kill_test)
+ break;
+
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+ init_completion(&read_start);
+
+ while (!kthread_should_stop() && !kill_test) {
+ ring_buffer_reset(buffer);
+
+ if (consumer) {
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_start);
+ }
+
+ ring_buffer_producer();
+
+ trace_printk("Sleeping for 10 secs\n");
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ * SLEEP_TIME);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+ int ret;
+
+ /* make a one meg buffer in overwite mode */
+ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+ if (!buffer)
+ return -ENOMEM;
+
+ if (!disable_reader) {
+ consumer = kthread_create(ring_buffer_consumer_thread,
+ NULL, "rb_consumer");
+ ret = PTR_ERR(consumer);
+ if (IS_ERR(consumer))
+ goto out_fail;
+ }
+
+ producer = kthread_run(ring_buffer_producer_thread,
+ NULL, "rb_producer");
+ ret = PTR_ERR(producer);
+
+ if (IS_ERR(producer))
+ goto out_kill;
+
+ return 0;
+
+ out_kill:
+ if (consumer)
+ kthread_stop(consumer);
+
+ out_fail:
+ ring_buffer_free(buffer);
+ return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+ kthread_stop(producer);
+ if (consumer)
+ kthread_stop(consumer);
+ ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add9..5c75deeefe3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,47 +11,58 @@
* Copyright (C) 2004-2006 Ingo Molnar
* Copyright (C) 2004 William Lee Irwin III
*/
+#include <linux/ring_buffer.h>
#include <linux/utsrelease.h>
+#include <linux/stacktrace.h>
+#include <linux/writeback.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/smp_lock.h>
#include <linux/notifier.h>
+#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
+#include <linux/kprobes.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
+#include <linux/splice.h>
#include <linux/kdebug.h>
+#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/gfp.h>
#include <linux/fs.h>
-#include <linux/kprobes.h>
-#include <linux/writeback.h>
-
-#include <linux/stacktrace.h>
-#include <linux/ring_buffer.h>
-#include <linux/irqflags.h>
#include "trace.h"
+#include "trace_output.h"
#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
-unsigned long __read_mostly tracing_thresh;
+/*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+int ring_buffer_expanded;
/*
* We need to change this state when a selftest is running.
* A selftest will lurk into the ring-buffer to count the
* entries inserted during the selftest although some concurrent
- * insertions into the ring-buffer such as ftrace_printk could occurred
+ * insertions into the ring-buffer such as trace_printk could occurred
* at the same time, giving false positive or negative results.
*/
static bool __read_mostly tracing_selftest_running;
+/*
+ * If a tracer is running, we do not want to run SELFTEST.
+ */
+bool __read_mostly tracing_selftest_disabled;
+
/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
{ }
@@ -73,9 +84,9 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
* of the tracer is successful. But that is the only place that sets
* this back to zero.
*/
-int tracing_disabled = 1;
+static int tracing_disabled = 1;
-static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
static inline void ftrace_disable_cpu(void)
{
@@ -91,6 +102,9 @@ static inline void ftrace_enable_cpu(void)
static cpumask_var_t __read_mostly tracing_buffer_mask;
+/* Define which cpu buffers are currently read in trace_pipe */
+static cpumask_var_t tracing_reader_cpumask;
+
#define for_each_tracing_cpu(cpu) \
for_each_cpu(cpu, tracing_buffer_mask)
@@ -109,14 +123,21 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
*/
int ftrace_dump_on_oops;
-static int tracing_set_tracer(char *buf);
+static int tracing_set_tracer(const char *buf);
+
+#define BOOTUP_TRACER_SIZE 100
+static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+static char *default_bootup_tracer;
static int __init set_ftrace(char *str)
{
- tracing_set_tracer(str);
+ strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+ default_bootup_tracer = bootup_tracer_buf;
+ /* We are using ftrace early, expand it */
+ ring_buffer_expanded = 1;
return 1;
}
-__setup("ftrace", set_ftrace);
+__setup("ftrace=", set_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
@@ -125,21 +146,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
-long
-ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(cycle_t nsec)
{
nsec += 500;
do_div(nsec, 1000);
return nsec;
}
-cycle_t ftrace_now(int cpu)
-{
- u64 ts = ring_buffer_time_stamp(cpu);
- ring_buffer_normalize_time_stamp(cpu, &ts);
- return ts;
-}
-
/*
* The global_trace is the descriptor that holds the tracing
* buffers for the live tracing. For each CPU, it contains
@@ -156,6 +169,28 @@ static struct trace_array global_trace;
static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+int filter_current_check_discard(struct ring_buffer *buffer,
+ struct ftrace_event_call *call, void *rec,
+ struct ring_buffer_event *event)
+{
+ return filter_check_discard(call, rec, buffer, event);
+}
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
+
+cycle_t ftrace_now(int cpu)
+{
+ u64 ts;
+
+ /* Early boot up does not have a buffer yet */
+ if (!global_trace.buffer)
+ return trace_clock_local();
+
+ ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
+ ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+
+ return ts;
+}
+
/*
* The max_tr is used to snapshot the global_trace when a maximum
* latency is reached. Some tracers will use this to store a maximum
@@ -186,9 +221,6 @@ int tracing_is_enabled(void)
return tracer_enabled;
}
-/* function tracing enabled */
-int ftrace_function_enabled;
-
/*
* trace_buf_size is the size in bytes that is allocated
* for a buffer. Note, the number of bytes is always rounded
@@ -229,7 +261,11 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
- TRACE_ITER_ANNOTATE;
+ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+ TRACE_ITER_GRAPH_TIME;
+
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
/**
* trace_wake_up - wake up tasks waiting for trace input
@@ -250,13 +286,12 @@ void trace_wake_up(void)
static int __init set_buf_size(char *str)
{
unsigned long buf_size;
- int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &buf_size);
+ buf_size = memparse(str, &str);
/* nr_entries can not be zero */
- if (ret < 0 || buf_size == 0)
+ if (buf_size == 0)
return 0;
trace_buf_size = buf_size;
return 1;
@@ -280,192 +315,58 @@ static const char *trace_options[] = {
"block",
"stacktrace",
"sched-tree",
- "ftrace_printk",
+ "trace_printk",
"ftrace_preempt",
"branch",
"annotate",
"userstacktrace",
"sym-userobj",
"printk-msg-only",
+ "context-info",
+ "latency-format",
+ "sleep-time",
+ "graph-time",
NULL
};
-/*
- * ftrace_max_lock is used to protect the swapping of buffers
- * when taking a max snapshot. The buffers themselves are
- * protected by per_cpu spinlocks. But the action of the swap
- * needs its own lock.
- *
- * This is defined as a raw_spinlock_t in order to help
- * with performance when lockdep debugging is enabled.
- */
-static raw_spinlock_t ftrace_max_lock =
- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
-/*
- * Copy the new maximum trace into the separate maximum-trace
- * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /debugfs/tracing/latency_trace)
- */
-static void
-__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
-{
- struct trace_array_cpu *data = tr->data[cpu];
-
- max_tr.cpu = cpu;
- max_tr.time_start = data->preempt_timestamp;
-
- data = max_tr.data[cpu];
- data->saved_latency = tracing_max_latency;
-
- memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
- data->pid = tsk->pid;
- data->uid = task_uid(tsk);
- data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
- data->policy = tsk->policy;
- data->rt_priority = tsk->rt_priority;
+static struct {
+ u64 (*func)(void);
+ const char *name;
+} trace_clocks[] = {
+ { trace_clock_local, "local" },
+ { trace_clock_global, "global" },
+};
- /* record this tasks comm */
- tracing_record_cmdline(current);
-}
+int trace_clock_id;
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
+ int len;
int ret;
- if (!len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len)
- return 0;
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-static int
-trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (len > ((PAGE_SIZE - 1) - s->len))
- return 0;
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-static int
-trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->len >= (PAGE_SIZE - 1))
- return 0;
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-
-static int
-trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
-{
- if (len > ((PAGE_SIZE - 1) - s->len))
+ if (!cnt)
return 0;
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
-static int
-trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- unsigned char *data = mem;
- int i, j;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-static int
-trace_seq_path(struct trace_seq *s, struct path *path)
-{
- unsigned char *p;
+ if (s->len <= s->readpos)
+ return -EBUSY;
- if (s->len >= (PAGE_SIZE - 1))
- return 0;
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret == cnt)
+ return -EFAULT;
- return 0;
-}
+ cnt -= ret;
-static void
-trace_seq_reset(struct trace_seq *s)
-{
- s->len = 0;
- s->readpos = 0;
+ s->readpos += cnt;
+ return cnt;
}
-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- int ret;
+ void *ret;
if (s->len <= s->readpos)
return -EBUSY;
@@ -473,23 +374,62 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
len = s->len - s->readpos;
if (cnt > len)
cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret)
+ ret = memcpy(buf, s->buffer + s->readpos, cnt);
+ if (!ret)
return -EFAULT;
- s->readpos += len;
+ s->readpos += cnt;
return cnt;
}
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+static raw_spinlock_t ftrace_max_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+unsigned long __read_mostly tracing_max_latency;
+unsigned long __read_mostly tracing_thresh;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ */
static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+ struct trace_array_cpu *data = tr->data[cpu];
+ struct trace_array_cpu *max_data = tr->data[cpu];
- s->buffer[len] = 0;
- seq_puts(m, s->buffer);
+ max_tr.cpu = cpu;
+ max_tr.time_start = data->preempt_timestamp;
+
+ max_data = max_tr.data[cpu];
+ max_data->saved_latency = tracing_max_latency;
+ max_data->critical_start = data->critical_start;
+ max_data->critical_end = data->critical_end;
- trace_seq_reset(s);
+ memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+ max_data->pid = tsk->pid;
+ max_data->uid = task_uid(tsk);
+ max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ max_data->policy = tsk->policy;
+ max_data->rt_priority = tsk->rt_priority;
+
+ /* record this tasks comm */
+ tracing_record_cmdline(tsk);
}
/**
@@ -506,16 +446,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
struct ring_buffer *buf = tr->buffer;
+ if (trace_stop_count)
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
tr->buffer = max_tr.buffer;
max_tr.buffer = buf;
- ftrace_disable_cpu();
- ring_buffer_reset(tr->buffer);
- ftrace_enable_cpu();
-
__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
}
@@ -533,21 +472,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
int ret;
+ if (trace_stop_count)
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
ftrace_disable_cpu();
- ring_buffer_reset(max_tr.buffer);
ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ */
+ trace_array_printk(&max_tr, _THIS_IP_,
+ "Failed to swap buffers due to commit in progress\n");
+ }
+
ftrace_enable_cpu();
- WARN_ON_ONCE(ret);
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
}
+#endif /* CONFIG_TRACER_MAX_TRACE */
/**
* register_tracer - register a tracer with the ftrace system.
@@ -556,6 +509,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* Register a new plugin tracer.
*/
int register_tracer(struct tracer *type)
+__releases(kernel_lock)
+__acquires(kernel_lock)
{
struct tracer *t;
int len;
@@ -594,12 +549,14 @@ int register_tracer(struct tracer *type)
else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
+ if (!type->wait_pipe)
+ type->wait_pipe = default_wait_pipe;
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
- if (type->selftest) {
+ if (type->selftest && !tracing_selftest_disabled) {
struct tracer *saved_tracer = current_trace;
struct trace_array *tr = &global_trace;
- int i;
/*
* Run a selftest on this tracer.
@@ -608,8 +565,7 @@ int register_tracer(struct tracer *type)
* internal tracing to verify that everything is in order.
* If we fail, we do not register this tracer.
*/
- for_each_tracing_cpu(i)
- tracing_reset(tr, i);
+ tracing_reset_online_cpus(tr);
current_trace = type;
/* the test is responsible for initializing and enabling */
@@ -622,8 +578,7 @@ int register_tracer(struct tracer *type)
goto out;
}
/* Only reset on passing, to avoid touching corrupted buffers */
- for_each_tracing_cpu(i)
- tracing_reset(tr, i);
+ tracing_reset_online_cpus(tr);
printk(KERN_CONT "PASSED\n");
}
@@ -638,8 +593,26 @@ int register_tracer(struct tracer *type)
out:
tracing_selftest_running = false;
mutex_unlock(&trace_types_lock);
- lock_kernel();
+ if (ret || !default_bootup_tracer)
+ goto out_unlock;
+
+ if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+ goto out_unlock;
+
+ printk(KERN_INFO "Starting tracer '%s'\n", type->name);
+ /* Do we want this tracer to start on bootup? */
+ tracing_set_tracer(type->name);
+ default_bootup_tracer = NULL;
+ /* disable other selftests, since this will break it. */
+ tracing_selftest_disabled = 1;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
+ type->name);
+#endif
+
+ out_unlock:
+ lock_kernel();
return ret;
}
@@ -658,6 +631,15 @@ void unregister_tracer(struct tracer *type)
found:
*t = (*t)->next;
+
+ if (type == current_trace && tracer_enabled) {
+ tracer_enabled = 0;
+ tracing_stop();
+ if (current_trace->stop)
+ current_trace->stop(&global_trace);
+ current_trace = &nop_trace;
+ }
+
if (strlen(type->name) != max_tracer_type_len)
goto out;
@@ -671,43 +653,72 @@ void unregister_tracer(struct tracer *type)
mutex_unlock(&trace_types_lock);
}
-void tracing_reset(struct trace_array *tr, int cpu)
+static void __tracing_reset(struct trace_array *tr, int cpu)
{
ftrace_disable_cpu();
ring_buffer_reset_cpu(tr->buffer, cpu);
ftrace_enable_cpu();
}
+void tracing_reset(struct trace_array *tr, int cpu)
+{
+ struct ring_buffer *buffer = tr->buffer;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+ __tracing_reset(tr, cpu);
+
+ ring_buffer_record_enable(buffer);
+}
+
void tracing_reset_online_cpus(struct trace_array *tr)
{
+ struct ring_buffer *buffer = tr->buffer;
int cpu;
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
+ __tracing_reset(tr, cpu);
+
+ ring_buffer_record_enable(buffer);
+}
+
+void tracing_reset_current(int cpu)
+{
+ tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+ tracing_reset_online_cpus(&global_trace);
}
#define SAVED_CMDLINES 128
+#define NO_CMDLINE_MAP UINT_MAX
static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
static int cmdline_idx;
-static DEFINE_SPINLOCK(trace_cmdline_lock);
+static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
/* temporary disable recording */
-atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_cmdline_disabled __read_mostly;
static void trace_init_cmdlines(void)
{
- memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
- memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+ memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
+ memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
cmdline_idx = 0;
}
-static int trace_stop_count;
-static DEFINE_SPINLOCK(tracing_start_lock);
-
/**
* ftrace_off_permanent - disable all ftrace code permanently
*
@@ -738,13 +749,12 @@ void tracing_start(void)
return;
spin_lock_irqsave(&tracing_start_lock, flags);
- if (--trace_stop_count)
- goto out;
-
- if (trace_stop_count < 0) {
- /* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- trace_stop_count = 0;
+ if (--trace_stop_count) {
+ if (trace_stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ trace_stop_count = 0;
+ }
goto out;
}
@@ -794,8 +804,7 @@ void trace_stop_cmdline_recording(void);
static void trace_save_cmdline(struct task_struct *tsk)
{
- unsigned map;
- unsigned idx;
+ unsigned pid, idx;
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return;
@@ -806,17 +815,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!spin_trylock(&trace_cmdline_lock))
+ if (!__raw_spin_trylock(&trace_cmdline_lock))
return;
idx = map_pid_to_cmdline[tsk->pid];
- if (idx >= SAVED_CMDLINES) {
+ if (idx == NO_CMDLINE_MAP) {
idx = (cmdline_idx + 1) % SAVED_CMDLINES;
- map = map_cmdline_to_pid[idx];
- if (map <= PID_MAX_DEFAULT)
- map_pid_to_cmdline[map] = (unsigned)-1;
+ /*
+ * Check whether the cmdline buffer at idx has a pid
+ * mapped. We are going to overwrite that entry so we
+ * need to clear the map_pid_to_cmdline. Otherwise we
+ * would read the new comm for the old pid.
+ */
+ pid = map_cmdline_to_pid[idx];
+ if (pid != NO_CMDLINE_MAP)
+ map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
+ map_cmdline_to_pid[idx] = tsk->pid;
map_pid_to_cmdline[tsk->pid] = idx;
cmdline_idx = idx;
@@ -824,33 +840,39 @@ static void trace_save_cmdline(struct task_struct *tsk)
memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
- spin_unlock(&trace_cmdline_lock);
+ __raw_spin_unlock(&trace_cmdline_lock);
}
-char *trace_find_cmdline(int pid)
+void trace_find_cmdline(int pid, char comm[])
{
- char *cmdline = "<...>";
unsigned map;
- if (!pid)
- return "<idle>";
+ if (!pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
- if (pid > PID_MAX_DEFAULT)
- goto out;
+ if (pid > PID_MAX_DEFAULT) {
+ strcpy(comm, "<...>");
+ return;
+ }
+ preempt_disable();
+ __raw_spin_lock(&trace_cmdline_lock);
map = map_pid_to_cmdline[pid];
- if (map >= SAVED_CMDLINES)
- goto out;
-
- cmdline = saved_cmdlines[map];
+ if (map != NO_CMDLINE_MAP)
+ strcpy(comm, saved_cmdlines[map]);
+ else
+ strcpy(comm, "<...>");
- out:
- return cmdline;
+ __raw_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
}
void tracing_record_cmdline(struct task_struct *tsk)
{
- if (atomic_read(&trace_record_cmdline_disabled))
+ if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
+ !tracing_is_on())
return;
trace_save_cmdline(tsk);
@@ -864,7 +886,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->tgid = (tsk) ? tsk->tgid : 0;
+ entry->tgid = (tsk) ? tsk->tgid : 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -875,81 +897,108 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
}
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
-void
-trace_function(struct trace_array *tr, struct trace_array_cpu *data,
- unsigned long ip, unsigned long parent_ip, unsigned long flags,
- int pc)
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags, int pc)
{
struct ring_buffer_event *event;
- struct ftrace_entry *entry;
- unsigned long irq_flags;
- /* If we are reading the ring buffer, don't trace */
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return;
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL) {
+ struct trace_entry *ent = ring_buffer_event_data(event);
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_FN;
- entry->ip = ip;
- entry->parent_ip = parent_ip;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ tracing_generic_entry_update(ent, flags, pc);
+ ent->type = type;
+ }
+
+ return event;
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void __trace_graph_entry(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct ftrace_graph_ent *trace,
- unsigned long flags,
- int pc)
+static inline void
+__trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ int wake)
{
- struct ring_buffer_event *event;
- struct ftrace_graph_ent_entry *entry;
- unsigned long irq_flags;
+ ring_buffer_unlock_commit(buffer, event);
- if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
- return;
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_userstack(buffer, flags, pc);
- event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_GRAPH_ENT;
- entry->graph_ent = *trace;
- ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ if (wake)
+ trace_wake_up();
+}
+
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
}
-static void __trace_graph_return(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct ftrace_graph_ret *trace,
- unsigned long flags,
- int pc)
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
+ int type, unsigned long len,
+ unsigned long flags, int pc)
{
+ *current_rb = global_trace.buffer;
+ return trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
+
+void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
+
+void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+
+void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ ring_buffer_discard_commit(buffer, event);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
+
+void
+trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_function;
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
- struct ftrace_graph_ret_entry *entry;
- unsigned long irq_flags;
+ struct ftrace_entry *entry;
+ /* If we are reading the ring buffer, don't trace */
if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
return;
- event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
- &irq_flags);
+ event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_GRAPH_RET;
- entry->ret = *trace;
- ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
}
-#endif
void
ftrace(struct trace_array *tr, struct trace_array_cpu *data,
@@ -957,31 +1006,24 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
int pc)
{
if (likely(!atomic_read(&data->disabled)))
- trace_function(tr, data, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, flags, pc);
}
-static void ftrace_trace_stack(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long flags,
- int skip, int pc)
-{
#ifdef CONFIG_STACKTRACE
+static void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc)
+{
+ struct ftrace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
- unsigned long irq_flags;
-
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
- return;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry), flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_STACK;
-
memset(&entry->caller, 0, sizeof(entry->caller));
trace.nr_entries = 0;
@@ -990,38 +1032,41 @@ static void ftrace_trace_stack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace(&trace);
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-#endif
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
}
-void __trace_stack(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long flags,
- int skip)
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc)
{
- ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ __ftrace_trace_stack(buffer, flags, skip, pc);
}
-static void ftrace_trace_userstack(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long flags, int pc)
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc)
{
-#ifdef CONFIG_STACKTRACE
+ __ftrace_trace_stack(tr->buffer, flags, skip, pc);
+}
+
+void
+ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
struct stack_trace trace;
- unsigned long irq_flags;
if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_USER_STACK;
memset(&entry->caller, 0, sizeof(entry->caller));
@@ -1031,112 +1076,45 @@ static void ftrace_trace_userstack(struct trace_array *tr,
trace.entries = entry->caller;
save_stack_trace_user(&trace);
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-#endif
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
}
-void __trace_userstack(struct trace_array *tr,
- struct trace_array_cpu *data,
- unsigned long flags)
+#ifdef UNUSED
+static void __trace_userstack(struct trace_array *tr, unsigned long flags)
{
- ftrace_trace_userstack(tr, data, flags, preempt_count());
+ ftrace_trace_userstack(tr, flags, preempt_count());
}
+#endif /* UNUSED */
+
+#endif /* CONFIG_STACKTRACE */
static void
-ftrace_trace_special(void *__tr, void *__data,
+ftrace_trace_special(void *__tr,
unsigned long arg1, unsigned long arg2, unsigned long arg3,
int pc)
{
struct ring_buffer_event *event;
- struct trace_array_cpu *data = __data;
struct trace_array *tr = __tr;
+ struct ring_buffer *buffer = tr->buffer;
struct special_entry *entry;
- unsigned long irq_flags;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
+ sizeof(*entry), 0, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, pc);
- entry->ent.type = TRACE_SPECIAL;
entry->arg1 = arg1;
entry->arg2 = arg2;
entry->arg3 = arg3;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
- ftrace_trace_stack(tr, data, irq_flags, 4, pc);
- ftrace_trace_userstack(tr, data, irq_flags, pc);
-
- trace_wake_up();
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void
__trace_special(void *__tr, void *__data,
unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
- ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
-}
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- unsigned long irq_flags;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_CTX;
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
- ftrace_trace_stack(tr, data, flags, 5, pc);
- ftrace_trace_userstack(tr, data, flags, pc);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct trace_array_cpu *data,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- unsigned long irq_flags;
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_WAKE;
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
- ftrace_trace_stack(tr, data, flags, 6, pc);
- ftrace_trace_userstack(tr, data, flags, pc);
-
- trace_wake_up();
+ ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
}
void
@@ -1157,152 +1135,163 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
data = tr->data[cpu];
if (likely(atomic_inc_return(&data->disabled) == 1))
- ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
+ ftrace_trace_special(tr, arg1, arg2, arg3, pc);
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
-#ifdef CONFIG_FUNCTION_TRACER
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ *
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
+ static raw_spinlock_t trace_buf_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ static u32 trace_buf[TRACE_BUF_SIZE];
+
+ struct ftrace_event_call *call = &event_bprint;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
struct trace_array_cpu *data;
+ struct bprint_entry *entry;
unsigned long flags;
- long disabled;
- int cpu, resched;
- int pc;
+ int disable;
+ int resched;
+ int cpu, len = 0, size, pc;
- if (unlikely(!ftrace_function_enabled))
- return;
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
pc = preempt_count();
resched = ftrace_preempt_disable();
- local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- trace_function(tr, data, ip, parent_ip, flags, pc);
- atomic_dec(&data->disabled);
- ftrace_preempt_enable(resched);
-}
+ disable = atomic_inc_return(&data->disabled);
+ if (unlikely(disable != 1))
+ goto out;
-static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
+ /* Lockdep uses trace_printk for lock tracing */
+ local_irq_save(flags);
+ __raw_spin_lock(&trace_buf_lock);
+ len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
- if (unlikely(!ftrace_function_enabled))
- return;
+ if (len > TRACE_BUF_SIZE || len < 0)
+ goto out_unlock;
- /*
- * Need to use raw, since this must be called before the
- * recursive protection is performed.
- */
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
+ size = sizeof(*entry) + sizeof(u32) * len;
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
+ if (!event)
+ goto out_unlock;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->fmt = fmt;
- if (likely(disabled == 1)) {
- pc = preempt_count();
- trace_function(tr, data, ip, parent_ip, flags, pc);
- }
+ memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
- atomic_dec(&data->disabled);
+out_unlock:
+ __raw_spin_unlock(&trace_buf_lock);
local_irq_restore(flags);
+
+out:
+ atomic_dec_return(&data->disabled);
+ ftrace_preempt_enable(resched);
+ unpause_graph_tracing();
+
+ return len;
}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-int trace_graph_entry(struct ftrace_graph_ent *trace)
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
{
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
-
- if (!ftrace_trace_task(current))
- return 0;
+ int ret;
+ va_list ap;
- if (!ftrace_graph_addr(trace->func))
+ if (!(trace_flags & TRACE_ITER_PRINTK))
return 0;
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_entry(tr, data, trace, flags, pc);
- }
- /* Only do the atomic if it is not already set */
- if (!test_tsk_trace_graph(current))
- set_tsk_trace_graph(current);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-
- return 1;
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
}
-void trace_graph_return(struct ftrace_graph_ret *trace)
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
{
- struct trace_array *tr = &global_trace;
+ static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+ static char trace_buf[TRACE_BUF_SIZE];
+
+ struct ftrace_event_call *call = &event_print;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
- int pc;
+ int cpu, len = 0, size, pc;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ int disable;
- local_irq_save(flags);
+ if (tracing_disabled || tracing_selftest_running)
+ return 0;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- pc = preempt_count();
- __trace_graph_return(tr, data, trace, flags, pc);
- }
- if (!trace->depth)
- clear_tsk_trace_graph(current);
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-static struct ftrace_ops trace_ops __read_mostly =
-{
- .func = function_trace_call,
-};
+ disable = atomic_inc_return(&data->disabled);
+ if (unlikely(disable != 1))
+ goto out;
-void tracing_start_function_trace(void)
-{
- ftrace_function_enabled = 0;
+ pause_graph_tracing();
+ raw_local_irq_save(irq_flags);
+ __raw_spin_lock(&trace_buf_lock);
+ len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
- if (trace_flags & TRACE_ITER_PREEMPTONLY)
- trace_ops.func = function_trace_call_preempt_only;
- else
- trace_ops.func = function_trace_call;
+ len = min(len, TRACE_BUF_SIZE-1);
+ trace_buf[len] = 0;
+
+ size = sizeof(*entry) + len + 1;
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, pc);
+ if (!event)
+ goto out_unlock;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
- register_ftrace_function(&trace_ops);
- ftrace_function_enabled = 1;
+ memcpy(&entry->buf, trace_buf, len);
+ entry->buf[len] = 0;
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
+
+ out_unlock:
+ __raw_spin_unlock(&trace_buf_lock);
+ raw_local_irq_restore(irq_flags);
+ unpause_graph_tracing();
+ out:
+ atomic_dec_return(&data->disabled);
+ preempt_enable_notrace();
+
+ return len;
}
-void tracing_stop_function_trace(void)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- ftrace_function_enabled = 0;
- unregister_ftrace_function(&trace_ops);
+ return trace_array_printk(&global_trace, ip, fmt, args);
}
-#endif
+EXPORT_SYMBOL_GPL(trace_vprintk);
enum trace_file_type {
TRACE_FILE_LAT_FMT = 1,
@@ -1345,10 +1334,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
{
struct ring_buffer *buffer = iter->tr->buffer;
struct trace_entry *ent, *next = NULL;
+ int cpu_file = iter->cpu_file;
u64 next_ts = 0, ts;
int next_cpu = -1;
int cpu;
+ /*
+ * If we are in a per_cpu trace file, don't bother by iterating over
+ * all cpu and peek directly.
+ */
+ if (cpu_file > TRACE_PIPE_ALL_CPU) {
+ if (ring_buffer_empty_cpu(buffer, cpu_file))
+ return NULL;
+ ent = peek_next_entry(iter, cpu_file, ent_ts);
+ if (ent_cpu)
+ *ent_cpu = cpu_file;
+
+ return ent;
+ }
+
for_each_tracing_cpu(cpu) {
if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1376,8 +1380,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
}
/* Find the next real entry, without updating the iterator itself */
-static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+ int *ent_cpu, u64 *ent_ts)
{
return __find_next_entry(iter, ent_cpu, ent_ts);
}
@@ -1426,19 +1430,63 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
return ent;
}
+static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+{
+ struct trace_array *tr = iter->tr;
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter;
+ unsigned long entries = 0;
+ u64 ts;
+
+ tr->data[cpu]->skipped_entries = 0;
+
+ if (!iter->buffer_iter[cpu])
+ return;
+
+ buf_iter = iter->buffer_iter[cpu];
+ ring_buffer_iter_reset(buf_iter);
+
+ /*
+ * We could have the case with the max latency tracers
+ * that a reset never took place on a cpu. This is evident
+ * by the timestamp being before the start of the buffer.
+ */
+ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ if (ts >= iter->tr->time_start)
+ break;
+ entries++;
+ ring_buffer_read(buf_iter, NULL);
+ }
+
+ tr->data[cpu]->skipped_entries = entries;
+}
+
+/*
+ * No necessary locking here. The worst thing which can
+ * happen is loosing events consumed at the same time
+ * by a trace_pipe reader.
+ * Other than that, we don't risk to crash the ring buffer
+ * because it serializes the readers.
+ *
+ * The current tracer is copied to avoid a global locking
+ * all around.
+ */
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
+ static struct tracer *old_tracer;
+ int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
int cpu;
+ /* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
-
- if (!current_trace || current_trace != iter->trace) {
- mutex_unlock(&trace_types_lock);
- return NULL;
+ if (unlikely(old_tracer != current_trace && current_trace)) {
+ old_tracer = current_trace;
+ *iter->trace = *current_trace;
}
+ mutex_unlock(&trace_types_lock);
atomic_inc(&trace_record_cmdline_disabled);
@@ -1449,9 +1497,11 @@ static void *s_start(struct seq_file *m, loff_t *pos)
ftrace_disable_cpu();
- for_each_tracing_cpu(cpu) {
- ring_buffer_iter_reset(iter->buffer_iter[cpu]);
- }
+ if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ for_each_tracing_cpu(cpu)
+ tracing_iter_reset(iter, cpu);
+ } else
+ tracing_iter_reset(iter, cpu_file);
ftrace_enable_cpu();
@@ -1463,161 +1513,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
p = s_next(m, p, &l);
}
+ trace_event_read_lock();
return p;
}
static void s_stop(struct seq_file *m, void *p)
{
atomic_dec(&trace_record_cmdline_disabled);
- mutex_unlock(&trace_types_lock);
-}
-
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
-{
- static const char tramp_name[] = "kretprobe_trampoline";
- int size = sizeof(tramp_name);
-
- if (strncmp(tramp_name, name, size) == 0)
- return "[unknown/kretprobe'd]";
- return name;
-}
-#else
-static inline const char *kretprobed(const char *name)
-{
- return name;
-}
-#endif /* CONFIG_KRETPROBES */
-
-static int
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
- const char *name;
-
- kallsyms_lookup(address, NULL, NULL, NULL, str);
-
- name = kretprobed(str);
-
- return trace_seq_printf(s, fmt, name);
-#endif
- return 1;
-}
-
-static int
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
- unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
- const char *name;
-
- sprint_symbol(str, address);
- name = kretprobed(str);
-
- return trace_seq_printf(s, fmt, name);
-#endif
- return 1;
-}
-
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
-int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
-{
- int ret;
-
- if (!ip)
- return trace_seq_printf(s, "0");
-
- if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
- else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
-
- if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
-}
-
-static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags)
-{
- struct file *file = NULL;
- unsigned long vmstart = 0;
- int ret = 1;
-
- if (mm) {
- const struct vm_area_struct *vma;
-
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, ip);
- if (vma) {
- file = vma->vm_file;
- vmstart = vma->vm_start;
- }
- if (file) {
- ret = trace_seq_path(s, &file->f_path);
- if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
- }
- up_read(&mm->mmap_sem);
- }
- if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
-}
-
-static int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
- unsigned long sym_flags)
-{
- struct mm_struct *mm = NULL;
- int ret = 1;
- unsigned int i;
-
- if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
- struct task_struct *task;
- /*
- * we do the lookup on the thread group leader,
- * since individual threads might have already quit!
- */
- rcu_read_lock();
- task = find_task_by_vpid(entry->ent.tgid);
- if (task)
- mm = get_task_mm(task);
- rcu_read_unlock();
- }
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- unsigned long ip = entry->caller[i];
-
- if (ip == ULONG_MAX || !ret)
- break;
- if (i && ret)
- ret = trace_seq_puts(s, " <- ");
- if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- continue;
- }
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- }
-
- if (mm)
- mmput(mm);
- return ret;
+ trace_event_read_unlock();
}
static void print_lat_help_header(struct seq_file *m)
@@ -1647,22 +1550,38 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
struct trace_array *tr = iter->tr;
struct trace_array_cpu *data = tr->data[tr->cpu];
struct tracer *type = current_trace;
- unsigned long total;
- unsigned long entries;
+ unsigned long entries = 0;
+ unsigned long total = 0;
+ unsigned long count;
const char *name = "preemption";
+ int cpu;
if (type)
name = type->name;
- entries = ring_buffer_entries(iter->tr->buffer);
- total = entries +
- ring_buffer_overruns(iter->tr->buffer);
- seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+ for_each_tracing_cpu(cpu) {
+ count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ /*
+ * If this buffer has skipped entries, then we hold all
+ * entries for the trace and we need to ignore the
+ * ones before the time stamp.
+ */
+ if (tr->data[cpu]->skipped_entries) {
+ count -= tr->data[cpu]->skipped_entries;
+ /* total is the same as the entries */
+ total += count;
+ } else
+ total += count +
+ ring_buffer_overrun_cpu(tr->buffer, cpu);
+ entries += count;
+ }
+
+ seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
- seq_puts(m, "-----------------------------------"
+ seq_puts(m, "# -----------------------------------"
"---------------------------------\n");
- seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+ seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
" (M:%s VP:%d, KP:%d, SP:%d HP:%d",
nsecs_to_usecs(data->saved_latency),
entries,
@@ -1684,121 +1603,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
#else
seq_puts(m, ")\n");
#endif
- seq_puts(m, " -----------------\n");
- seq_printf(m, " | task: %.16s-%d "
+ seq_puts(m, "# -----------------\n");
+ seq_printf(m, "# | task: %.16s-%d "
"(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
data->comm, data->pid, data->uid, data->nice,
data->policy, data->rt_priority);
- seq_puts(m, " -----------------\n");
+ seq_puts(m, "# -----------------\n");
if (data->critical_start) {
- seq_puts(m, " => started at: ");
+ seq_puts(m, "# => started at: ");
seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
trace_print_seq(m, &iter->seq);
- seq_puts(m, "\n => ended at: ");
+ seq_puts(m, "\n# => ended at: ");
seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
trace_print_seq(m, &iter->seq);
- seq_puts(m, "\n");
+ seq_puts(m, "\n#\n");
}
- seq_puts(m, "\n");
-}
-
-static void
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
-{
- int hardirq, softirq;
- char *comm;
-
- comm = trace_find_cmdline(entry->pid);
-
- trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
- trace_seq_printf(s, "%3d", cpu);
- trace_seq_printf(s, "%c%c",
- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
- ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
-
- hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
- softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
- if (hardirq && softirq) {
- trace_seq_putc(s, 'H');
- } else {
- if (hardirq) {
- trace_seq_putc(s, 'h');
- } else {
- if (softirq)
- trace_seq_putc(s, 's');
- else
- trace_seq_putc(s, '.');
- }
- }
-
- if (entry->preempt_count)
- trace_seq_printf(s, "%x", entry->preempt_count);
- else
- trace_seq_puts(s, ".");
-}
-
-unsigned long preempt_mark_thresh = 100;
-
-static void
-lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
- unsigned long rel_usecs)
-{
- trace_seq_printf(s, " %4lldus", abs_usecs);
- if (rel_usecs > preempt_mark_thresh)
- trace_seq_puts(s, "!: ");
- else if (rel_usecs > 1)
- trace_seq_puts(s, "+: ");
- else
- trace_seq_puts(s, " : ");
-}
-
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
-/*
- * The message is supposed to contain an ending newline.
- * If the printing stops prematurely, try to add a newline of our own.
- */
-void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
-{
- struct trace_entry *ent;
- struct trace_field_cont *cont;
- bool ok = true;
-
- ent = peek_next_entry(iter, iter->cpu, NULL);
- if (!ent || ent->type != TRACE_CONT) {
- trace_seq_putc(s, '\n');
- return;
- }
-
- do {
- cont = (struct trace_field_cont *)ent;
- if (ok)
- ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
-
- ftrace_disable_cpu();
-
- if (iter->buffer_iter[iter->cpu])
- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
- else
- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
-
- ftrace_enable_cpu();
-
- ent = peek_next_entry(iter, iter->cpu, NULL);
- } while (ent && ent->type == TRACE_CONT);
-
- if (!ok)
- trace_seq_putc(s, '\n');
+ seq_puts(m, "#\n");
}
static void test_cpu_buff_start(struct trace_iterator *iter)
@@ -1814,142 +1636,15 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (cpumask_test_cpu(iter->cpu, iter->started))
return;
- cpumask_set_cpu(iter->cpu, iter->started);
- trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
-}
-
-static enum print_line_t
-print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
-{
- struct trace_seq *s = &iter->seq;
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_entry *next_entry;
- unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
- struct trace_entry *entry = iter->ent;
- unsigned long abs_usecs;
- unsigned long rel_usecs;
- u64 next_ts;
- char *comm;
- int S, T;
- int i;
-
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
- test_cpu_buff_start(iter);
-
- next_entry = find_next_entry(iter, NULL, &next_ts);
- if (!next_entry)
- next_ts = iter->ts;
- rel_usecs = ns2usecs(next_ts - iter->ts);
- abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
-
- if (verbose) {
- comm = trace_find_cmdline(entry->pid);
- trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
- " %ld.%03ldms (+%ld.%03ldms): ",
- comm,
- entry->pid, cpu, entry->flags,
- entry->preempt_count, trace_idx,
- ns2usecs(iter->ts),
- abs_usecs/1000,
- abs_usecs % 1000, rel_usecs/1000,
- rel_usecs % 1000);
- } else {
- lat_print_generic(s, entry, cpu);
- lat_print_timestamp(s, abs_usecs, rel_usecs);
- }
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_puts(s, " (");
- seq_print_ip_sym(s, field->parent_ip, sym_flags);
- trace_seq_puts(s, ")\n");
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
- comm = trace_find_cmdline(field->next_pid);
- trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, entry->type == TRACE_CTX ? "==>" : " +",
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm);
- break;
- }
- case TRACE_SPECIAL: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- break;
- }
- case TRACE_STACK: {
- struct stack_entry *field;
-
- trace_assign_type(field, entry);
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- if (i)
- trace_seq_puts(s, " <= ");
- seq_print_ip_sym(s, field->caller[i], sym_flags);
- }
- trace_seq_puts(s, "\n");
- break;
- }
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_printf(s, ": %s", field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
- }
- case TRACE_BRANCH: {
- struct trace_branch *field;
-
- trace_assign_type(field, entry);
-
- trace_seq_printf(s, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line);
- break;
- }
- case TRACE_USER_STACK: {
- struct userstack_entry *field;
+ if (iter->tr->data[iter->cpu]->skipped_entries)
+ return;
- trace_assign_type(field, entry);
+ cpumask_set_cpu(iter->cpu, iter->started);
- seq_print_userip_objs(field, s, sym_flags);
- trace_seq_putc(s, '\n');
- break;
- }
- default:
- trace_seq_printf(s, "Unknown type %d\n", entry->type);
- }
- return TRACE_TYPE_HANDLED;
+ /* Don't print started cpu buffer for the first entry of the trace */
+ if (iter->idx > 1)
+ trace_seq_printf(s, "##### CPU %u buffer started ####\n",
+ iter->cpu);
}
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1957,333 +1652,84 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct trace_entry *entry;
- unsigned long usec_rem;
- unsigned long long t;
- unsigned long secs;
- char *comm;
- int ret;
- int S, T;
- int i;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
test_cpu_buff_start(iter);
- comm = trace_find_cmdline(iter->ent->pid);
-
- t = ns2usecs(iter->ts);
- usec_rem = do_div(t, 1000000ULL);
- secs = (unsigned long)t;
+ event = ftrace_find_event(entry->type);
- ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = seq_print_ip_sym(s, field->ip, sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
- field->parent_ip) {
- ret = trace_seq_printf(s, " <-");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = seq_print_ip_sym(s,
- field->parent_ip,
- sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_printf(s, "\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
- ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- entry->type == TRACE_CTX ? "==>" : " +",
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_SPECIAL: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_STACK: {
- struct stack_entry *field;
-
- trace_assign_type(field, entry);
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- if (i) {
- ret = trace_seq_puts(s, " <= ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = seq_print_ip_sym(s, field->caller[i],
- sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ if (!trace_print_lat_context(iter))
+ goto partial;
+ } else {
+ if (!trace_print_context(iter))
+ goto partial;
}
- ret = trace_seq_puts(s, "\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
-
- seq_print_ip_sym(s, field->ip, sym_flags);
- trace_seq_printf(s, ": %s", field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
}
- case TRACE_GRAPH_RET: {
- return print_graph_function(iter);
- }
- case TRACE_GRAPH_ENT: {
- return print_graph_function(iter);
- }
- case TRACE_BRANCH: {
- struct trace_branch *field;
-
- trace_assign_type(field, entry);
- trace_seq_printf(s, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line);
- break;
- }
- case TRACE_USER_STACK: {
- struct userstack_entry *field;
+ if (event)
+ return event->trace(iter, sym_flags);
- trace_assign_type(field, entry);
+ if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
+ goto partial;
- ret = seq_print_userip_objs(field, s, sym_flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- }
return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
}
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
- int ret;
- int S, T;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
- ret = trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, iter->ts);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, "%x %x\n",
- field->ip,
- field->parent_ip);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = entry->type == TRACE_WAKE ? '+' :
- task_state_char(field->prev_state);
- ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (!trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts))
+ goto partial;
}
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
- trace_assign_type(field, entry);
+ event = ftrace_find_event(entry->type);
+ if (event)
+ return event->raw(iter, 0);
- ret = trace_seq_printf(s, "# %ld %ld %ld\n",
- field->arg1,
- field->arg2,
- field->arg3);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- break;
- }
- case TRACE_PRINT: {
- struct print_entry *field;
-
- trace_assign_type(field, entry);
+ if (!trace_seq_printf(s, "%d ?\n", entry->type))
+ goto partial;
- trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
- break;
- }
- }
return TRACE_TYPE_HANDLED;
+partial:
+ return TRACE_TYPE_PARTIAL_LINE;
}
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return 0; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return 0; \
-} while (0)
-
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
unsigned char newline = '\n';
struct trace_entry *entry;
- int S, T;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
- SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
-
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
- break;
- }
- case TRACE_CTX:
- case TRACE_WAKE: {
- struct ctx_switch_entry *field;
-
- trace_assign_type(field, entry);
-
- T = task_state_char(field->next_state);
- S = entry->type == TRACE_WAKE ? '+' :
- task_state_char(field->prev_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
- break;
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
}
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
- trace_assign_type(field, entry);
-
- SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
- SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
- break;
- }
+ event = ftrace_find_event(entry->type);
+ if (event) {
+ enum print_line_t ret = event->hex(iter, 0);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
}
- SEQ_PUT_FIELD_RET(s, newline);
-
- return TRACE_TYPE_HANDLED;
-}
-static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
-{
- struct trace_seq *s = &iter->seq;
- struct trace_entry *entry = iter->ent;
- struct print_entry *field;
- int ret;
-
- trace_assign_type(field, entry);
-
- ret = trace_seq_printf(s, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
+ SEQ_PUT_FIELD_RET(s, newline);
return TRACE_TYPE_HANDLED;
}
@@ -2292,59 +1738,37 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
+ struct trace_event *event;
entry = iter->ent;
- if (entry->type == TRACE_CONT)
- return TRACE_TYPE_HANDLED;
-
- SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, entry->cpu);
- SEQ_PUT_FIELD_RET(s, iter->ts);
-
- switch (entry->type) {
- case TRACE_FN: {
- struct ftrace_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
- break;
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ SEQ_PUT_FIELD_RET(s, entry->pid);
+ SEQ_PUT_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_FIELD_RET(s, iter->ts);
}
- case TRACE_CTX: {
- struct ctx_switch_entry *field;
- trace_assign_type(field, entry);
-
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
- break;
- }
- case TRACE_SPECIAL:
- case TRACE_USER_STACK:
- case TRACE_STACK: {
- struct special_entry *field;
-
- trace_assign_type(field, entry);
-
- SEQ_PUT_FIELD_RET(s, field->arg1);
- SEQ_PUT_FIELD_RET(s, field->arg2);
- SEQ_PUT_FIELD_RET(s, field->arg3);
- break;
- }
- }
- return 1;
+ event = ftrace_find_event(entry->type);
+ return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
}
static int trace_empty(struct trace_iterator *iter)
{
int cpu;
+ /* If we are looking at one CPU buffer, only check that one */
+ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+ cpu = iter->cpu_file;
+ if (iter->buffer_iter[cpu]) {
+ if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ return 0;
+ } else {
+ if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ return 0;
+ }
+ return 1;
+ }
+
for_each_tracing_cpu(cpu) {
if (iter->buffer_iter[cpu]) {
if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
@@ -2358,6 +1782,7 @@ static int trace_empty(struct trace_iterator *iter)
return 1;
}
+/* Called with trace_event_read_lock() held. */
static enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
@@ -2368,10 +1793,15 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
return ret;
}
+ if (iter->ent->type == TRACE_BPRINT &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_bprintk_msg_only(iter);
+
if (iter->ent->type == TRACE_PRINT &&
trace_flags & TRACE_ITER_PRINTK &&
trace_flags & TRACE_ITER_PRINTK_MSGONLY)
- return print_printk_msg_only(iter);
+ return trace_print_printk_msg_only(iter);
if (trace_flags & TRACE_ITER_BIN)
return print_bin_fmt(iter);
@@ -2382,9 +1812,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
if (trace_flags & TRACE_ITER_RAW)
return print_raw_fmt(iter);
- if (iter->iter_flags & TRACE_FILE_LAT_FMT)
- return print_lat_fmt(iter, iter->idx, iter->cpu);
-
return print_trace_fmt(iter);
}
@@ -2426,30 +1853,45 @@ static struct seq_operations tracer_seq_ops = {
};
static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file, int *ret)
+__tracing_open(struct inode *inode, struct file *file)
{
+ long cpu_file = (long) inode->i_private;
+ void *fail_ret = ERR_PTR(-ENOMEM);
struct trace_iterator *iter;
struct seq_file *m;
- int cpu;
+ int cpu, ret;
- if (tracing_disabled) {
- *ret = -ENODEV;
- return NULL;
- }
+ if (tracing_disabled)
+ return ERR_PTR(-ENODEV);
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter) {
- *ret = -ENOMEM;
- goto out;
- }
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * We make a copy of the current tracer to avoid concurrent
+ * changes on it while we are reading.
+ */
mutex_lock(&trace_types_lock);
+ iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
+ if (!iter->trace)
+ goto fail;
+
+ if (current_trace)
+ *iter->trace = *current_trace;
+
+ if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+ goto fail;
+
+ cpumask_clear(iter->started);
+
if (current_trace && current_trace->print_max)
iter->tr = &max_tr;
else
- iter->tr = inode->i_private;
- iter->trace = current_trace;
+ iter->tr = &global_trace;
iter->pos = -1;
+ mutex_init(&iter->mutex);
+ iter->cpu_file = cpu_file;
/* Notify the tracer early; before we stop tracing. */
if (iter->trace && iter->trace->open)
@@ -2459,30 +1901,34 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
if (ring_buffer_overruns(iter->tr->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
+ /* stop the trace while dumping */
+ tracing_stop();
- for_each_tracing_cpu(cpu) {
+ if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+ for_each_tracing_cpu(cpu) {
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_start(iter->tr->buffer, cpu);
+ tracing_iter_reset(iter, cpu);
+ }
+ } else {
+ cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_start(iter->tr->buffer, cpu);
-
- if (!iter->buffer_iter[cpu])
- goto fail_buffer;
+ ring_buffer_read_start(iter->tr->buffer, cpu);
+ tracing_iter_reset(iter, cpu);
}
- /* TODO stop tracer */
- *ret = seq_open(file, &tracer_seq_ops);
- if (*ret)
+ ret = seq_open(file, &tracer_seq_ops);
+ if (ret < 0) {
+ fail_ret = ERR_PTR(ret);
goto fail_buffer;
+ }
m = file->private_data;
m->private = iter;
- /* stop the trace while dumping */
- tracing_stop();
-
mutex_unlock(&trace_types_lock);
- out:
return iter;
fail_buffer:
@@ -2490,10 +1936,14 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
if (iter->buffer_iter[cpu])
ring_buffer_read_finish(iter->buffer_iter[cpu]);
}
+ free_cpumask_var(iter->started);
+ tracing_start();
+ fail:
mutex_unlock(&trace_types_lock);
+ kfree(iter->trace);
kfree(iter);
- return ERR_PTR(-ENOMEM);
+ return fail_ret;
}
int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2505,12 +1955,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
-int tracing_release(struct inode *inode, struct file *file)
+static int tracing_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
- struct trace_iterator *iter = m->private;
+ struct trace_iterator *iter;
int cpu;
+ if (!(file->f_mode & FMODE_READ))
+ return 0;
+
+ iter = m->private;
+
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
if (iter->buffer_iter[cpu])
@@ -2525,55 +1980,59 @@ int tracing_release(struct inode *inode, struct file *file)
mutex_unlock(&trace_types_lock);
seq_release(inode, file);
+ mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
+ kfree(iter->trace);
kfree(iter);
return 0;
}
static int tracing_open(struct inode *inode, struct file *file)
{
- int ret;
-
- __tracing_open(inode, file, &ret);
-
- return ret;
-}
-
-static int tracing_lt_open(struct inode *inode, struct file *file)
-{
struct trace_iterator *iter;
- int ret;
+ int ret = 0;
- iter = __tracing_open(inode, file, &ret);
+ /* If this file was open for write, then erase contents */
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC)) {
+ long cpu = (long) inode->i_private;
- if (!ret)
- iter->iter_flags |= TRACE_FILE_LAT_FMT;
+ if (cpu == TRACE_PIPE_ALL_CPU)
+ tracing_reset_online_cpus(&global_trace);
+ else
+ tracing_reset(&global_trace, cpu);
+ }
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+ }
return ret;
}
-
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t = v;
(*pos)++;
if (t)
t = t->next;
- m->private = t;
-
return t;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct tracer *t = m->private;
+ struct tracer *t;
loff_t l = 0;
mutex_lock(&trace_types_lock);
- for (; t && l < *pos; t = t_next(m, t, &l))
+ for (t = trace_types; t && l < *pos; t = t_next(m, t, &l))
;
return t;
@@ -2609,35 +2068,28 @@ static struct seq_operations show_traces_seq_ops = {
static int show_traces_open(struct inode *inode, struct file *file)
{
- int ret;
-
if (tracing_disabled)
return -ENODEV;
- ret = seq_open(file, &show_traces_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = trace_types;
- }
+ return seq_open(file, &show_traces_seq_ops);
+}
- return ret;
+static ssize_t
+tracing_write_stub(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ return count;
}
-static struct file_operations tracing_fops = {
+static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
+ .write = tracing_write_stub,
.llseek = seq_lseek,
.release = tracing_release,
};
-static struct file_operations tracing_lt_fops = {
- .open = tracing_lt_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = tracing_release,
-};
-
-static struct file_operations show_traces_fops = {
+static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.release = seq_release,
@@ -2692,11 +2144,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
- mutex_lock(&tracing_cpumask_update_lock);
err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
if (err)
goto err_unlock;
+ mutex_lock(&tracing_cpumask_update_lock);
+
local_irq_disable();
__raw_spin_lock(&ftrace_max_lock);
for_each_tracing_cpu(cpu) {
@@ -2724,13 +2177,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
return count;
err_unlock:
- mutex_unlock(&tracing_cpumask_update_lock);
- free_cpumask_var(tracing_cpumask);
+ free_cpumask_var(tracing_cpumask_new);
return err;
}
-static struct file_operations tracing_cpumask_fops = {
+static const struct file_operations tracing_cpumask_fops = {
.open = tracing_open_generic,
.read = tracing_cpumask_read,
.write = tracing_cpumask_write,
@@ -2740,80 +2192,85 @@ static ssize_t
tracing_trace_options_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- int i;
+ struct tracer_opt *trace_opts;
+ u32 tracer_flags;
+ int len = 0;
char *buf;
int r = 0;
- int len = 0;
- u32 tracer_flags = current_trace->flags->val;
- struct tracer_opt *trace_opts = current_trace->flags->opts;
+ int i;
- /* calulate max size */
+ /* calculate max size */
for (i = 0; trace_options[i]; i++) {
len += strlen(trace_options[i]);
- len += 3; /* "no" and space */
+ len += 3; /* "no" and newline */
}
+ mutex_lock(&trace_types_lock);
+ tracer_flags = current_trace->flags->val;
+ trace_opts = current_trace->flags->opts;
+
/*
* Increase the size with names of options specific
* of the current tracer.
*/
for (i = 0; trace_opts[i].name; i++) {
len += strlen(trace_opts[i].name);
- len += 3; /* "no" and space */
+ len += 3; /* "no" and newline */
}
- /* +2 for \n and \0 */
- buf = kmalloc(len + 2, GFP_KERNEL);
- if (!buf)
+ /* +1 for \0 */
+ buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf) {
+ mutex_unlock(&trace_types_lock);
return -ENOMEM;
+ }
for (i = 0; trace_options[i]; i++) {
if (trace_flags & (1 << i))
- r += sprintf(buf + r, "%s ", trace_options[i]);
+ r += sprintf(buf + r, "%s\n", trace_options[i]);
else
- r += sprintf(buf + r, "no%s ", trace_options[i]);
+ r += sprintf(buf + r, "no%s\n", trace_options[i]);
}
for (i = 0; trace_opts[i].name; i++) {
if (tracer_flags & trace_opts[i].bit)
- r += sprintf(buf + r, "%s ",
+ r += sprintf(buf + r, "%s\n",
trace_opts[i].name);
else
- r += sprintf(buf + r, "no%s ",
+ r += sprintf(buf + r, "no%s\n",
trace_opts[i].name);
}
+ mutex_unlock(&trace_types_lock);
- r += sprintf(buf + r, "\n");
- WARN_ON(r >= len + 2);
+ WARN_ON(r >= len + 1);
r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
kfree(buf);
-
return r;
}
/* Try to assign a tracer specific option */
static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
{
- struct tracer_flags *trace_flags = trace->flags;
+ struct tracer_flags *tracer_flags = trace->flags;
struct tracer_opt *opts = NULL;
int ret = 0, i = 0;
int len;
- for (i = 0; trace_flags->opts[i].name; i++) {
- opts = &trace_flags->opts[i];
+ for (i = 0; tracer_flags->opts[i].name; i++) {
+ opts = &tracer_flags->opts[i];
len = strlen(opts->name);
if (strncmp(cmp, opts->name, len) == 0) {
- ret = trace->set_flag(trace_flags->val,
+ ret = trace->set_flag(tracer_flags->val,
opts->bit, !neg);
break;
}
}
/* Not found */
- if (!trace_flags->opts[i].name)
+ if (!tracer_flags->opts[i].name)
return -EINVAL;
/* Refused to handle */
@@ -2821,13 +2278,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
return ret;
if (neg)
- trace_flags->val &= ~opts->bit;
+ tracer_flags->val &= ~opts->bit;
else
- trace_flags->val |= opts->bit;
+ tracer_flags->val |= opts->bit;
return 0;
}
+static void set_tracer_flags(unsigned int mask, int enabled)
+{
+ /* do nothing if flag is already set */
+ if (!!(trace_flags & mask) == !!enabled)
+ return;
+
+ if (enabled)
+ trace_flags |= mask;
+ else
+ trace_flags &= ~mask;
+}
+
static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -2855,17 +2324,16 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
int len = strlen(trace_options[i]);
if (strncmp(cmp, trace_options[i], len) == 0) {
- if (neg)
- trace_flags &= ~(1 << i);
- else
- trace_flags |= (1 << i);
+ set_tracer_flags(1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
if (!trace_options[i]) {
+ mutex_lock(&trace_types_lock);
ret = set_tracer_option(current_trace, cmp, neg);
+ mutex_unlock(&trace_types_lock);
if (ret)
return ret;
}
@@ -2875,7 +2343,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static struct file_operations tracing_iter_fops = {
+static const struct file_operations tracing_iter_fops = {
.open = tracing_open_generic,
.read = tracing_trace_options_read,
.write = tracing_trace_options_write,
@@ -2883,21 +2351,20 @@ static struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
- "# mkdir /debug\n"
- "# mount -t debugfs nodev /debug\n\n"
- "# cat /debug/tracing/available_tracers\n"
- "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
- "# cat /debug/tracing/current_tracer\n"
- "none\n"
- "# echo sched_switch > /debug/tracing/current_tracer\n"
- "# cat /debug/tracing/current_tracer\n"
+ "# mount -t debugfs nodev /sys/kernel/debug\n\n"
+ "# cat /sys/kernel/debug/tracing/available_tracers\n"
+ "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
+ "# cat /sys/kernel/debug/tracing/current_tracer\n"
+ "nop\n"
+ "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
+ "# cat /sys/kernel/debug/tracing/current_tracer\n"
"sched_switch\n"
- "# cat /debug/tracing/trace_options\n"
+ "# cat /sys/kernel/debug/tracing/trace_options\n"
"noprint-parent nosym-offset nosym-addr noverbose\n"
- "# echo print-parent > /debug/tracing/trace_options\n"
- "# echo 1 > /debug/tracing/tracing_enabled\n"
- "# cat /debug/tracing/trace > /tmp/trace.txt\n"
- "echo 0 > /debug/tracing/tracing_enabled\n"
+ "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
+ "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+ "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
+ "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
;
static ssize_t
@@ -2908,12 +2375,62 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
readme_msg, strlen(readme_msg));
}
-static struct file_operations tracing_readme_fops = {
+static const struct file_operations tracing_readme_fops = {
.open = tracing_open_generic,
.read = tracing_readme_read,
};
static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *buf_comm;
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int pid;
+ int i;
+
+ file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+ if (!file_buf)
+ return -ENOMEM;
+
+ buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+ if (!buf_comm) {
+ kfree(file_buf);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+
+ for (i = 0; i < SAVED_CMDLINES; i++) {
+ int r;
+
+ pid = map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ trace_find_cmdline(pid, buf_comm);
+ r = sprintf(buf, "%d %s\n", pid, buf_comm);
+ buf += r;
+ len += r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, len);
+
+ kfree(file_buf);
+ kfree(buf_comm);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_read,
+};
+
+static ssize_t
tracing_ctrl_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -2930,7 +2447,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
{
struct trace_array *tr = filp->private_data;
char buf[64];
- long val;
+ unsigned long val;
int ret;
if (cnt >= sizeof(buf))
@@ -2985,13 +2502,105 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static int tracing_set_tracer(char *buf)
+int tracer_init(struct tracer *t, struct trace_array *tr)
+{
+ tracing_reset_online_cpus(tr);
+ return t->init(tr);
+}
+
+static int tracing_resize_ring_buffer(unsigned long size)
+{
+ int ret;
+
+ /*
+ * If kernel or user changes the size of the ring buffer
+ * we use the size that was given, and we can forget about
+ * expanding it later.
+ */
+ ring_buffer_expanded = 1;
+
+ ret = ring_buffer_resize(global_trace.buffer, size);
+ if (ret < 0)
+ return ret;
+
+ ret = ring_buffer_resize(max_tr.buffer, size);
+ if (ret < 0) {
+ int r;
+
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.entries);
+ if (r < 0) {
+ /*
+ * AARGH! We are left with different
+ * size max buffer!!!!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snap shot. We succeeded to
+ * update the size of the main buffer, but failed to
+ * update the size of the max buffer. But when we tried
+ * to reset the main buffer to the original size, we
+ * failed there too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ WARN_ON(1);
+ tracing_disabled = 1;
+ }
+ return ret;
+ }
+
+ global_trace.entries = size;
+
+ return ret;
+}
+
+/**
+ * tracing_update_buffers - used by tracing facility to expand ring buffers
+ *
+ * To save on memory when the tracing is never used on a system with it
+ * configured in. The ring buffers are set to a minimum size. But once
+ * a user starts to use the tracing facility, then they need to grow
+ * to their default size.
+ *
+ * This function is to be called when a tracer is about to be used.
+ */
+int tracing_update_buffers(void)
+{
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+ if (!ring_buffer_expanded)
+ ret = tracing_resize_ring_buffer(trace_buf_size);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+struct trace_option_dentry;
+
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer);
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts);
+
+static int tracing_set_tracer(const char *buf)
{
+ static struct trace_option_dentry *topts;
struct trace_array *tr = &global_trace;
struct tracer *t;
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ if (!ring_buffer_expanded) {
+ ret = tracing_resize_ring_buffer(trace_buf_size);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ }
+
for (t = trace_types; t; t = t->next) {
if (strcmp(t->name, buf) == 0)
break;
@@ -3007,9 +2616,14 @@ static int tracing_set_tracer(char *buf)
if (current_trace && current_trace->reset)
current_trace->reset(tr);
+ destroy_trace_option_files(topts);
+
current_trace = t;
+
+ topts = create_trace_option_files(current_trace);
+
if (t->init) {
- ret = t->init(tr);
+ ret = tracer_init(t, tr);
if (ret)
goto out;
}
@@ -3072,9 +2686,9 @@ static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- long *ptr = filp->private_data;
+ unsigned long *ptr = filp->private_data;
char buf[64];
- long val;
+ unsigned long val;
int ret;
if (cnt >= sizeof(buf))
@@ -3094,54 +2708,99 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static atomic_t tracing_reader;
-
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
+ long cpu_file = (long) inode->i_private;
struct trace_iterator *iter;
+ int ret = 0;
if (tracing_disabled)
return -ENODEV;
- /* We only allow for reader of the pipe */
- if (atomic_inc_return(&tracing_reader) != 1) {
- atomic_dec(&tracing_reader);
- return -EBUSY;
+ mutex_lock(&trace_types_lock);
+
+ /* We only allow one reader per cpu */
+ if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (!cpumask_empty(tracing_reader_cpumask)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ cpumask_setall(tracing_reader_cpumask);
+ } else {
+ if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
+ cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
+ else {
+ ret = -EBUSY;
+ goto out;
+ }
}
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
+ if (!iter) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
- kfree(iter);
- return -ENOMEM;
+ /*
+ * We make a copy of the current tracer to avoid concurrent
+ * changes on it while we are reading.
+ */
+ iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
+ if (!iter->trace) {
+ ret = -ENOMEM;
+ goto fail;
}
+ if (current_trace)
+ *iter->trace = *current_trace;
- mutex_lock(&trace_types_lock);
+ if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto fail;
+ }
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+ iter->cpu_file = cpu_file;
iter->tr = &global_trace;
- iter->trace = current_trace;
+ mutex_init(&iter->mutex);
filp->private_data = iter;
if (iter->trace->pipe_open)
iter->trace->pipe_open(iter);
+
+out:
mutex_unlock(&trace_types_lock);
+ return ret;
- return 0;
+fail:
+ kfree(iter->trace);
+ kfree(iter);
+ mutex_unlock(&trace_types_lock);
+ return ret;
}
static int tracing_release_pipe(struct inode *inode, struct file *file)
{
struct trace_iterator *iter = file->private_data;
+ mutex_lock(&trace_types_lock);
+
+ if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
+ cpumask_clear(tracing_reader_cpumask);
+ else
+ cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+
+ mutex_unlock(&trace_types_lock);
+
free_cpumask_var(iter->started);
+ mutex_destroy(&iter->mutex);
+ kfree(iter->trace);
kfree(iter);
- atomic_dec(&tracing_reader);
return 0;
}
@@ -3167,67 +2826,57 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
}
}
-/*
- * Consumer reader.
- */
-static ssize_t
-tracing_read_pipe(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+
+void default_wait_pipe(struct trace_iterator *iter)
{
- struct trace_iterator *iter = filp->private_data;
- ssize_t sret;
+ DEFINE_WAIT(wait);
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
+ prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
- trace_seq_reset(&iter->seq);
+ if (trace_empty(iter))
+ schedule();
- mutex_lock(&trace_types_lock);
- if (iter->trace->read) {
- sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
- if (sret)
- goto out;
- }
+ finish_wait(&trace_wait, &wait);
+}
+
+/*
+ * This is a make-shift waitqueue.
+ * A tracer might use this callback on some rare cases:
+ *
+ * 1) the current tracer might hold the runqueue lock when it wakes up
+ * a reader, hence a deadlock (sched, function, and function graph tracers)
+ * 2) the function tracers, trace all functions, we don't want
+ * the overhead of calling wake_up and friends
+ * (and tracing them too)
+ *
+ * Anyway, this is really very primitive wakeup.
+ */
+void poll_wait_pipe(struct trace_iterator *iter)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* sleep for 100 msecs, and try again. */
+ schedule_timeout(HZ / 10);
+}
+
+/* Must be called with trace_types_lock mutex held. */
+static int tracing_wait_pipe(struct file *filp)
+{
+ struct trace_iterator *iter = filp->private_data;
-waitagain:
- sret = 0;
while (trace_empty(iter)) {
if ((filp->f_flags & O_NONBLOCK)) {
- sret = -EAGAIN;
- goto out;
+ return -EAGAIN;
}
- /*
- * This is a make-shift waitqueue. The reason we don't use
- * an actual wait queue is because:
- * 1) we only ever have one waiter
- * 2) the tracing, traces all functions, we don't want
- * the overhead of calling wake_up and friends
- * (and tracing them too)
- * Anyway, this is really very primitive wakeup.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- iter->tr->waiter = current;
-
- mutex_unlock(&trace_types_lock);
-
- /* sleep for 100 msecs, and try again. */
- schedule_timeout(HZ/10);
+ mutex_unlock(&iter->mutex);
- mutex_lock(&trace_types_lock);
-
- iter->tr->waiter = NULL;
+ iter->trace->wait_pipe(iter);
- if (signal_pending(current)) {
- sret = -EINTR;
- goto out;
- }
+ mutex_lock(&iter->mutex);
- if (iter->trace != current_trace)
- goto out;
+ if (signal_pending(current))
+ return -EINTR;
/*
* We block until we read something and tracing is disabled.
@@ -3240,13 +2889,59 @@ waitagain:
*/
if (!tracer_enabled && iter->pos)
break;
+ }
+
+ return 1;
+}
- continue;
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_iterator *iter = filp->private_data;
+ static struct tracer *old_tracer;
+ ssize_t sret;
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ return sret;
+
+ trace_seq_init(&iter->seq);
+
+ /* copy the tracer to avoid using a global lock all around */
+ mutex_lock(&trace_types_lock);
+ if (unlikely(old_tracer != current_trace && current_trace)) {
+ old_tracer = current_trace;
+ *iter->trace = *current_trace;
+ }
+ mutex_unlock(&trace_types_lock);
+
+ /*
+ * Avoid more than one consumer on a single file descriptor
+ * This is just a matter of traces coherency, the ring buffer itself
+ * is protected.
+ */
+ mutex_lock(&iter->mutex);
+ if (iter->trace->read) {
+ sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+ if (sret)
+ goto out;
}
+waitagain:
+ sret = tracing_wait_pipe(filp);
+ if (sret <= 0)
+ goto out;
+
/* stop when tracing is finished */
- if (trace_empty(iter))
+ if (trace_empty(iter)) {
+ sret = 0;
goto out;
+ }
if (cnt >= PAGE_SIZE)
cnt = PAGE_SIZE - 1;
@@ -3257,6 +2952,7 @@ waitagain:
offsetof(struct trace_iterator, seq));
iter->pos = -1;
+ trace_event_read_lock();
while (find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
int len = iter->seq.len;
@@ -3267,17 +2963,18 @@ waitagain:
iter->seq.len = len;
break;
}
-
- trace_consume(iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
if (iter->seq.len >= cnt)
break;
}
+ trace_event_read_unlock();
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (iter->seq.readpos >= iter->seq.len)
- trace_seq_reset(&iter->seq);
+ trace_seq_init(&iter->seq);
/*
* If there was nothing to send to user, inspite of consuming trace
@@ -3287,20 +2984,169 @@ waitagain:
goto waitagain;
out:
- mutex_unlock(&trace_types_lock);
+ mutex_unlock(&iter->mutex);
return sret;
}
+static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ __free_page(buf->page);
+}
+
+static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
+ unsigned int idx)
+{
+ __free_page(spd->pages[idx]);
+}
+
+static struct pipe_buf_operations tracing_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .confirm = generic_pipe_buf_confirm,
+ .release = tracing_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static size_t
+tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
+{
+ size_t count;
+ int ret;
+
+ /* Seq buffer is page-sized, exactly what we need. */
+ for (;;) {
+ count = iter->seq.len;
+ ret = print_trace_line(iter);
+ count = iter->seq.len - count;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.len -= count;
+ break;
+ }
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.len -= count;
+ break;
+ }
+
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
+ rem -= count;
+ if (!find_next_entry_inc(iter)) {
+ rem = 0;
+ iter->ent = NULL;
+ break;
+ }
+ }
+
+ return rem;
+}
+
+static ssize_t tracing_splice_read_pipe(struct file *filp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial[PIPE_BUFFERS];
+ struct trace_iterator *iter = filp->private_data;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages = 0, /* This gets updated below. */
+ .flags = flags,
+ .ops = &tracing_pipe_buf_ops,
+ .spd_release = tracing_spd_release_pipe,
+ };
+ static struct tracer *old_tracer;
+ ssize_t ret;
+ size_t rem;
+ unsigned int i;
+
+ /* copy the tracer to avoid using a global lock all around */
+ mutex_lock(&trace_types_lock);
+ if (unlikely(old_tracer != current_trace && current_trace)) {
+ old_tracer = current_trace;
+ *iter->trace = *current_trace;
+ }
+ mutex_unlock(&trace_types_lock);
+
+ mutex_lock(&iter->mutex);
+
+ if (iter->trace->splice_read) {
+ ret = iter->trace->splice_read(iter, filp,
+ ppos, pipe, len, flags);
+ if (ret)
+ goto out_err;
+ }
+
+ ret = tracing_wait_pipe(filp);
+ if (ret <= 0)
+ goto out_err;
+
+ if (!iter->ent && !find_next_entry_inc(iter)) {
+ ret = -EFAULT;
+ goto out_err;
+ }
+
+ trace_event_read_lock();
+
+ /* Fill as many pages as possible. */
+ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ break;
+
+ rem = tracing_fill_pipe_page(rem, iter);
+
+ /* Copy the data into the page, so we can start over. */
+ ret = trace_seq_to_buffer(&iter->seq,
+ page_address(pages[i]),
+ iter->seq.len);
+ if (ret < 0) {
+ __free_page(pages[i]);
+ break;
+ }
+ partial[i].offset = 0;
+ partial[i].len = iter->seq.len;
+
+ trace_seq_init(&iter->seq);
+ }
+
+ trace_event_read_unlock();
+ mutex_unlock(&iter->mutex);
+
+ spd.nr_pages = i;
+
+ return splice_to_pipe(pipe, &spd);
+
+out_err:
+ mutex_unlock(&iter->mutex);
+
+ return ret;
+}
+
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- char buf[64];
+ char buf[96];
int r;
- r = sprintf(buf, "%lu\n", tr->entries >> 10);
+ mutex_lock(&trace_types_lock);
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ tr->entries >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", tr->entries >> 10);
+ mutex_unlock(&trace_types_lock);
+
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -3344,28 +3190,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
val <<= 10;
if (val != global_trace.entries) {
- ret = ring_buffer_resize(global_trace.buffer, val);
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
-
- ret = ring_buffer_resize(max_tr.buffer, val);
+ ret = tracing_resize_ring_buffer(val);
if (ret < 0) {
- int r;
cnt = ret;
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.entries);
- if (r < 0) {
- /* AARGH! We are left with different
- * size max buffer!!!! */
- WARN_ON(1);
- tracing_disabled = 1;
- }
goto out;
}
-
- global_trace.entries = val;
}
filp->f_pos += cnt;
@@ -3393,7 +3222,7 @@ static int mark_printk(const char *fmt, ...)
int ret;
va_list args;
va_start(args, fmt);
- ret = trace_vprintk(0, -1, fmt, args);
+ ret = trace_vprintk(0, fmt, args);
va_end(args);
return ret;
}
@@ -3433,42 +3262,397 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static struct file_operations tracing_max_lat_fops = {
+static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int bufiter = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
+ bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
+ "%s%s%s%s", i ? " " : "",
+ i == trace_clock_id ? "[" : "", trace_clocks[i].name,
+ i == trace_clock_id ? "]" : "");
+ bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
+}
+
+static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ char buf[64];
+ const char *clockstr;
+ int i;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ clockstr = strstrip(buf);
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
+ if (strcmp(trace_clocks[i].name, clockstr) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(trace_clocks))
+ return -EINVAL;
+
+ trace_clock_id = i;
+
+ mutex_lock(&trace_types_lock);
+
+ ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
+ if (max_tr.buffer)
+ ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
+
+ mutex_unlock(&trace_types_lock);
+
+ *fpos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
};
-static struct file_operations tracing_ctrl_fops = {
+static const struct file_operations tracing_ctrl_fops = {
.open = tracing_open_generic,
.read = tracing_ctrl_read,
.write = tracing_ctrl_write,
};
-static struct file_operations set_tracer_fops = {
+static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
};
-static struct file_operations tracing_pipe_fops = {
+static const struct file_operations tracing_pipe_fops = {
.open = tracing_open_pipe,
.poll = tracing_poll_pipe,
.read = tracing_read_pipe,
+ .splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
};
-static struct file_operations tracing_entries_fops = {
+static const struct file_operations tracing_entries_fops = {
.open = tracing_open_generic,
.read = tracing_entries_read,
.write = tracing_entries_write,
};
-static struct file_operations tracing_mark_fops = {
+static const struct file_operations tracing_mark_fops = {
.open = tracing_open_generic,
.write = tracing_mark_write,
};
+static const struct file_operations trace_clock_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_clock_read,
+ .write = tracing_clock_write,
+};
+
+struct ftrace_buffer_info {
+ struct trace_array *tr;
+ void *spare;
+ int cpu;
+ unsigned int read;
+};
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp)
+{
+ int cpu = (int)(long)inode->i_private;
+ struct ftrace_buffer_info *info;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->tr = &global_trace;
+ info->cpu = cpu;
+ info->spare = NULL;
+ /* Force reading ring buffer for first read */
+ info->read = (unsigned int)-1;
+
+ filp->private_data = info;
+
+ return nonseekable_open(inode, filp);
+}
+
+static ssize_t
+tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ unsigned int pos;
+ ssize_t ret;
+ size_t size;
+
+ if (!count)
+ return 0;
+
+ if (!info->spare)
+ info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+ if (!info->spare)
+ return -ENOMEM;
+
+ /* Do we have previous read data to read? */
+ if (info->read < PAGE_SIZE)
+ goto read;
+
+ info->read = 0;
+
+ ret = ring_buffer_read_page(info->tr->buffer,
+ &info->spare,
+ count,
+ info->cpu, 0);
+ if (ret < 0)
+ return 0;
+
+ pos = ring_buffer_page_len(info->spare);
+
+ if (pos < PAGE_SIZE)
+ memset(info->spare + pos, 0, PAGE_SIZE - pos);
+
+read:
+ size = PAGE_SIZE - info->read;
+ if (size > count)
+ size = count;
+
+ ret = copy_to_user(ubuf, info->spare + info->read, size);
+ if (ret == size)
+ return -EFAULT;
+ size -= ret;
+
+ *ppos += size;
+ info->read += size;
+
+ return size;
+}
+
+static int tracing_buffers_release(struct inode *inode, struct file *file)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+
+ if (info->spare)
+ ring_buffer_free_read_page(info->tr->buffer, info->spare);
+ kfree(info);
+
+ return 0;
+}
+
+struct buffer_ref {
+ struct ring_buffer *buffer;
+ void *page;
+ int ref;
+};
+
+static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+ if (--ref->ref)
+ return;
+
+ ring_buffer_free_read_page(ref->buffer, ref->page);
+ kfree(ref);
+ buf->private = 0;
+}
+
+static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return 1;
+}
+
+static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+ ref->ref++;
+}
+
+/* Pipe buffer operations for a buffer. */
+static struct pipe_buf_operations buffer_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .confirm = generic_pipe_buf_confirm,
+ .release = buffer_pipe_buf_release,
+ .steal = buffer_pipe_buf_steal,
+ .get = buffer_pipe_buf_get,
+};
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ struct buffer_ref *ref =
+ (struct buffer_ref *)spd->partial[i].private;
+
+ if (--ref->ref)
+ return;
+
+ ring_buffer_free_read_page(ref->buffer, ref->page);
+ kfree(ref);
+ spd->partial[i].private = 0;
+}
+
+static ssize_t
+tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &buffer_pipe_buf_ops,
+ .spd_release = buffer_spd_release,
+ };
+ struct buffer_ref *ref;
+ int entries, size, i;
+ size_t ret;
+
+ if (*ppos & (PAGE_SIZE - 1)) {
+ WARN_ONCE(1, "Ftrace: previous read must page-align\n");
+ return -EINVAL;
+ }
+
+ if (len & (PAGE_SIZE - 1)) {
+ WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
+ if (len < PAGE_SIZE)
+ return -EINVAL;
+ len &= PAGE_MASK;
+ }
+
+ entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+ for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+ struct page *page;
+ int r;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref)
+ break;
+
+ ref->ref = 1;
+ ref->buffer = info->tr->buffer;
+ ref->page = ring_buffer_alloc_read_page(ref->buffer);
+ if (!ref->page) {
+ kfree(ref);
+ break;
+ }
+
+ r = ring_buffer_read_page(ref->buffer, &ref->page,
+ len, info->cpu, 1);
+ if (r < 0) {
+ ring_buffer_free_read_page(ref->buffer,
+ ref->page);
+ kfree(ref);
+ break;
+ }
+
+ /*
+ * zero out any left over data, this is going to
+ * user land.
+ */
+ size = ring_buffer_page_len(ref->page);
+ if (size < PAGE_SIZE)
+ memset(ref->page + size, 0, PAGE_SIZE - size);
+
+ page = virt_to_page(ref->page);
+
+ spd.pages[i] = page;
+ spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].offset = 0;
+ spd.partial[i].private = (unsigned long)ref;
+ spd.nr_pages++;
+ *ppos += PAGE_SIZE;
+
+ entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ }
+
+ spd.nr_pages = i;
+
+ /* did we read anything? */
+ if (!spd.nr_pages) {
+ if (flags & SPLICE_F_NONBLOCK)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ /* TODO: block */
+ return ret;
+ }
+
+ ret = splice_to_pipe(pipe, &spd);
+
+ return ret;
+}
+
+static const struct file_operations tracing_buffers_fops = {
+ .open = tracing_buffers_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+ .llseek = no_llseek,
+};
+
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long cpu = (unsigned long)filp->private_data;
+ struct trace_array *tr = &global_trace;
+ struct trace_seq *s;
+ unsigned long cnt;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return ENOMEM;
+
+ trace_seq_init(s);
+
+ cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "entries: %ld\n", cnt);
+
+ cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+ cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+ count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_stats_read,
+};
+
#ifdef CONFIG_DYNAMIC_FTRACE
int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3500,7 +3684,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
return r;
}
-static struct file_operations tracing_dyn_info_fops = {
+static const struct file_operations tracing_dyn_info_fops = {
.open = tracing_open_generic,
.read = tracing_read_dyn_info,
};
@@ -3515,6 +3699,9 @@ struct dentry *tracing_init_dentry(void)
if (d_tracer)
return d_tracer;
+ if (!debugfs_initialized())
+ return NULL;
+
d_tracer = debugfs_create_dir("tracing", NULL);
if (!d_tracer && !once) {
@@ -3526,170 +3713,405 @@ struct dentry *tracing_init_dentry(void)
return d_tracer;
}
+static struct dentry *d_percpu;
+
+struct dentry *tracing_dentry_percpu(void)
+{
+ static int once;
+ struct dentry *d_tracer;
+
+ if (d_percpu)
+ return d_percpu;
+
+ d_tracer = tracing_init_dentry();
+
+ if (!d_tracer)
+ return NULL;
+
+ d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+
+ if (!d_percpu && !once) {
+ once = 1;
+ pr_warning("Could not create debugfs directory 'per_cpu'\n");
+ return NULL;
+ }
+
+ return d_percpu;
+}
+
+static void tracing_init_debugfs_percpu(long cpu)
+{
+ struct dentry *d_percpu = tracing_dentry_percpu();
+ struct dentry *d_cpu;
+ /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
+ char cpu_dir[7];
+
+ if (cpu > 999 || cpu < 0)
+ return;
+
+ sprintf(cpu_dir, "cpu%ld", cpu);
+ d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+ if (!d_cpu) {
+ pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+ return;
+ }
+
+ /* per cpu trace_pipe */
+ trace_create_file("trace_pipe", 0444, d_cpu,
+ (void *) cpu, &tracing_pipe_fops);
+
+ /* per cpu trace */
+ trace_create_file("trace", 0644, d_cpu,
+ (void *) cpu, &tracing_fops);
+
+ trace_create_file("trace_pipe_raw", 0444, d_cpu,
+ (void *) cpu, &tracing_buffers_fops);
+
+ trace_create_file("stats", 0444, d_cpu,
+ (void *) cpu, &tracing_stats_fops);
+}
+
#ifdef CONFIG_FTRACE_SELFTEST
/* Let selftest have access to static functions in this file */
#include "trace_selftest.c"
#endif
-static __init int tracer_init_debugfs(void)
+struct trace_option_dentry {
+ struct tracer_opt *opt;
+ struct tracer_flags *flags;
+ struct dentry *entry;
+};
+
+static ssize_t
+trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_option_dentry *topt = filp->private_data;
+ char *buf;
+
+ if (topt->flags->val & topt->opt->bit)
+ buf = "1\n";
+ else
+ buf = "0\n";
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_option_dentry *topt = filp->private_data;
+ unsigned long val;
+ char buf[64];
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = 0;
+ switch (val) {
+ case 0:
+ /* do nothing if already cleared */
+ if (!(topt->flags->val & topt->opt->bit))
+ break;
+
+ mutex_lock(&trace_types_lock);
+ if (current_trace->set_flag)
+ ret = current_trace->set_flag(topt->flags->val,
+ topt->opt->bit, 0);
+ mutex_unlock(&trace_types_lock);
+ if (ret)
+ return ret;
+ topt->flags->val &= ~topt->opt->bit;
+ break;
+ case 1:
+ /* do nothing if already set */
+ if (topt->flags->val & topt->opt->bit)
+ break;
+
+ mutex_lock(&trace_types_lock);
+ if (current_trace->set_flag)
+ ret = current_trace->set_flag(topt->flags->val,
+ topt->opt->bit, 1);
+ mutex_unlock(&trace_types_lock);
+ if (ret)
+ return ret;
+ topt->flags->val |= topt->opt->bit;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+
+static const struct file_operations trace_options_fops = {
+ .open = tracing_open_generic,
+ .read = trace_options_read,
+ .write = trace_options_write,
+};
+
+static ssize_t
+trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ long index = (long)filp->private_data;
+ char *buf;
+
+ if (trace_flags & (1 << index))
+ buf = "1\n";
+ else
+ buf = "0\n";
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ long index = (long)filp->private_data;
+ char buf[64];
+ unsigned long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+ set_tracer_flags(1 << index, val);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations trace_options_core_fops = {
+ .open = tracing_open_generic,
+ .read = trace_options_core_read,
+ .write = trace_options_core_write,
+};
+
+struct dentry *trace_create_file(const char *name,
+ mode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file(name, mode, parent, data, fops);
+ if (!ret)
+ pr_warning("Could not create debugfs '%s' entry\n", name);
+
+ return ret;
+}
+
+
+static struct dentry *trace_options_init_dentry(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
+ static struct dentry *t_options;
+
+ if (t_options)
+ return t_options;
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return NULL;
- entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
- &global_trace, &tracing_ctrl_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
-
- entry = debugfs_create_file("trace_options", 0644, d_tracer,
- NULL, &tracing_iter_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace_options' entry\n");
-
- entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
- NULL, &tracing_cpumask_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
- entry = debugfs_create_file("latency_trace", 0444, d_tracer,
- &global_trace, &tracing_lt_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'latency_trace' entry\n");
-
- entry = debugfs_create_file("trace", 0444, d_tracer,
- &global_trace, &tracing_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'trace' entry\n");
-
- entry = debugfs_create_file("available_tracers", 0444, d_tracer,
- &global_trace, &show_traces_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
- entry = debugfs_create_file("current_tracer", 0444, d_tracer,
- &global_trace, &set_tracer_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
- entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
- &tracing_max_latency,
- &tracing_max_lat_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'tracing_max_latency' entry\n");
-
- entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'tracing_thresh' entry\n");
- entry = debugfs_create_file("README", 0644, d_tracer,
- NULL, &tracing_readme_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'README' entry\n");
-
- entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
- NULL, &tracing_pipe_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'trace_pipe' entry\n");
-
- entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
- &global_trace, &tracing_entries_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'buffer_size_kb' entry\n");
-
- entry = debugfs_create_file("trace_marker", 0220, d_tracer,
- NULL, &tracing_mark_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'trace_marker' entry\n");
+ t_options = debugfs_create_dir("options", d_tracer);
+ if (!t_options) {
+ pr_warning("Could not create debugfs directory 'options'\n");
+ return NULL;
+ }
-#ifdef CONFIG_DYNAMIC_FTRACE
- entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt,
- &tracing_dyn_info_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'dyn_ftrace_total_info' entry\n");
-#endif
-#ifdef CONFIG_SYSPROF_TRACER
- init_tracer_sysprof_debugfs(d_tracer);
-#endif
- return 0;
+ return t_options;
}
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+static void
+create_trace_option_file(struct trace_option_dentry *topt,
+ struct tracer_flags *flags,
+ struct tracer_opt *opt)
{
- static DEFINE_SPINLOCK(trace_buf_lock);
- static char trace_buf[TRACE_BUF_SIZE];
+ struct dentry *t_options;
- struct ring_buffer_event *event;
- struct trace_array *tr = &global_trace;
- struct trace_array_cpu *data;
- int cpu, len = 0, size, pc;
- struct print_entry *entry;
- unsigned long irq_flags;
+ t_options = trace_options_init_dentry();
+ if (!t_options)
+ return;
- if (tracing_disabled || tracing_selftest_running)
- return 0;
+ topt->flags = flags;
+ topt->opt = opt;
- pc = preempt_count();
- preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
+ &trace_options_fops);
- if (unlikely(atomic_read(&data->disabled)))
- goto out;
+}
- pause_graph_tracing();
- spin_lock_irqsave(&trace_buf_lock, irq_flags);
- len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer)
+{
+ struct trace_option_dentry *topts;
+ struct tracer_flags *flags;
+ struct tracer_opt *opts;
+ int cnt;
- len = min(len, TRACE_BUF_SIZE-1);
- trace_buf[len] = 0;
+ if (!tracer)
+ return NULL;
- size = sizeof(*entry) + len + 1;
- event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
- if (!event)
- goto out_unlock;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, irq_flags, pc);
- entry->ent.type = TRACE_PRINT;
- entry->ip = ip;
- entry->depth = depth;
+ flags = tracer->flags;
- memcpy(&entry->buf, trace_buf, len);
- entry->buf[len] = 0;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ if (!flags || !flags->opts)
+ return NULL;
- out_unlock:
- spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
- unpause_graph_tracing();
- out:
- preempt_enable_notrace();
+ opts = flags->opts;
- return len;
+ for (cnt = 0; opts[cnt].name; cnt++)
+ ;
+
+ topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
+ if (!topts)
+ return NULL;
+
+ for (cnt = 0; opts[cnt].name; cnt++)
+ create_trace_option_file(&topts[cnt], flags,
+ &opts[cnt]);
+
+ return topts;
}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts)
{
- int ret;
- va_list ap;
+ int cnt;
- if (!(trace_flags & TRACE_ITER_PRINTK))
- return 0;
+ if (!topts)
+ return;
- va_start(ap, fmt);
- ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
- va_end(ap);
- return ret;
+ for (cnt = 0; topts[cnt].opt; cnt++) {
+ if (topts[cnt].entry)
+ debugfs_remove(topts[cnt].entry);
+ }
+
+ kfree(topts);
+}
+
+static struct dentry *
+create_trace_option_core_file(const char *option, long index)
+{
+ struct dentry *t_options;
+
+ t_options = trace_options_init_dentry();
+ if (!t_options)
+ return NULL;
+
+ return trace_create_file(option, 0644, t_options, (void *)index,
+ &trace_options_core_fops);
+}
+
+static __init void create_trace_options_dir(void)
+{
+ struct dentry *t_options;
+ int i;
+
+ t_options = trace_options_init_dentry();
+ if (!t_options)
+ return;
+
+ for (i = 0; trace_options[i]; i++)
+ create_trace_option_core_file(trace_options[i], i);
+}
+
+static __init int tracer_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ int cpu;
+
+ d_tracer = tracing_init_dentry();
+
+ trace_create_file("tracing_enabled", 0644, d_tracer,
+ &global_trace, &tracing_ctrl_fops);
+
+ trace_create_file("trace_options", 0644, d_tracer,
+ NULL, &tracing_iter_fops);
+
+ trace_create_file("tracing_cpumask", 0644, d_tracer,
+ NULL, &tracing_cpumask_fops);
+
+ trace_create_file("trace", 0644, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+ trace_create_file("available_tracers", 0444, d_tracer,
+ &global_trace, &show_traces_fops);
+
+ trace_create_file("current_tracer", 0644, d_tracer,
+ &global_trace, &set_tracer_fops);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tracing_max_latency, &tracing_max_lat_fops);
+
+ trace_create_file("tracing_thresh", 0644, d_tracer,
+ &tracing_thresh, &tracing_max_lat_fops);
+#endif
+
+ trace_create_file("README", 0444, d_tracer,
+ NULL, &tracing_readme_fops);
+
+ trace_create_file("trace_pipe", 0444, d_tracer,
+ (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
+
+ trace_create_file("buffer_size_kb", 0644, d_tracer,
+ &global_trace, &tracing_entries_fops);
+
+ trace_create_file("trace_marker", 0220, d_tracer,
+ NULL, &tracing_mark_fops);
+
+ trace_create_file("saved_cmdlines", 0444, d_tracer,
+ NULL, &tracing_saved_cmdlines_fops);
+
+ trace_create_file("trace_clock", 0644, d_tracer, NULL,
+ &trace_clock_fops);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+ init_tracer_sysprof_debugfs(d_tracer);
+#endif
+
+ create_trace_options_dir();
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_debugfs_percpu(cpu);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(__ftrace_printk);
static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
@@ -3736,7 +4158,7 @@ static struct notifier_block trace_die_notifier = {
* it if we decide to change what log level the ftrace dump
* should be at.
*/
-#define KERN_TRACE KERN_INFO
+#define KERN_TRACE KERN_EMERG
static void
trace_printk_seq(struct trace_seq *s)
@@ -3750,39 +4172,48 @@ trace_printk_seq(struct trace_seq *s)
printk(KERN_TRACE "%s", s->buffer);
- trace_seq_reset(s);
+ trace_seq_init(s);
}
-void ftrace_dump(void)
+static void __ftrace_dump(bool disable_tracing)
{
- static DEFINE_SPINLOCK(ftrace_dump_lock);
+ static raw_spinlock_t ftrace_dump_lock =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ unsigned int old_userobj;
static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
/* only one dump */
- spin_lock_irqsave(&ftrace_dump_lock, flags);
+ local_irq_save(flags);
+ __raw_spin_lock(&ftrace_dump_lock);
if (dump_ran)
goto out;
dump_ran = 1;
- /* No turning back! */
- ftrace_kill();
+ tracing_off();
+
+ if (disable_tracing)
+ ftrace_kill();
for_each_tracing_cpu(cpu) {
atomic_inc(&global_trace.data[cpu]->disabled);
}
+ old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ /* Simulate the iterator */
iter.tr = &global_trace;
iter.trace = current_trace;
+ iter.cpu_file = TRACE_PIPE_ALL_CPU;
/*
* We need to stop all tracing on all CPUS to read the
@@ -3806,8 +4237,11 @@ void ftrace_dump(void)
iter.pos = -1;
if (find_next_entry_inc(&iter) != NULL) {
- print_trace_line(&iter);
- trace_consume(&iter);
+ int ret;
+
+ ret = print_trace_line(&iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(&iter);
}
trace_printk_seq(&iter.seq);
@@ -3818,13 +4252,30 @@ void ftrace_dump(void)
else
printk(KERN_TRACE "---------------------------------\n");
+ /* Re-enable tracing if requested */
+ if (!disable_tracing) {
+ trace_flags |= old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ }
+ tracing_on();
+ }
+
out:
- spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+ __raw_spin_unlock(&ftrace_dump_lock);
+ local_irq_restore(flags);
+}
+
+/* By default: disable tracing after the dump */
+void ftrace_dump(void)
+{
+ __ftrace_dump(true);
}
__init static int tracer_alloc_buffers(void)
{
- struct trace_array_cpu *data;
+ int ring_buf_size;
int i;
int ret = -ENOMEM;
@@ -3834,11 +4285,21 @@ __init static int tracer_alloc_buffers(void)
if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
goto out_free_buffer_mask;
+ if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
+ goto out_free_tracing_cpumask;
+
+ /* To save memory, keep the ring buffer size to its minimum */
+ if (ring_buffer_expanded)
+ ring_buf_size = trace_buf_size;
+ else
+ ring_buf_size = 1;
+
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
+ cpumask_clear(tracing_reader_cpumask);
/* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+ global_trace.buffer = ring_buffer_alloc(ring_buf_size,
TRACE_BUFFER_FLAGS);
if (!global_trace.buffer) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3849,7 +4310,7 @@ __init static int tracer_alloc_buffers(void)
#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+ max_tr.buffer = ring_buffer_alloc(ring_buf_size,
TRACE_BUFFER_FLAGS);
if (!max_tr.buffer) {
printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
@@ -3863,21 +4324,17 @@ __init static int tracer_alloc_buffers(void)
/* Allocate the first page for all buffers */
for_each_tracing_cpu(i) {
- data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+ global_trace.data[i] = &per_cpu(global_trace_cpu, i);
max_tr.data[i] = &per_cpu(max_data, i);
}
trace_init_cmdlines();
register_tracer(&nop_trace);
+ current_trace = &nop_trace;
#ifdef CONFIG_BOOT_TRACER
register_tracer(&boot_tracer);
- current_trace = &boot_tracer;
- current_trace->init(&global_trace);
-#else
- current_trace = &nop_trace;
#endif
-
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -3885,14 +4342,38 @@ __init static int tracer_alloc_buffers(void)
&trace_panic_notifier);
register_die_notifier(&trace_die_notifier);
- ret = 0;
+
+ return 0;
out_free_cpumask:
+ free_cpumask_var(tracing_reader_cpumask);
+out_free_tracing_cpumask:
free_cpumask_var(tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
out:
return ret;
}
+
+__init static int clear_boot_tracer(void)
+{
+ /*
+ * The default tracer at boot buffer is an init section.
+ * This function is called in lateinit. If we did not
+ * find the boot tracer, then clear it out, to prevent
+ * later registration from accessing the buffer that is
+ * about to be freed.
+ */
+ if (!default_bootup_tracer)
+ return 0;
+
+ printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
+ default_bootup_tracer);
+ default_bootup_tracer = NULL;
+
+ return 0;
+}
+
early_initcall(tracer_alloc_buffers);
fs_initcall(tracer_init_debugfs);
+late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381bfd9..fa1dccb579d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,11 @@
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
+#include <linux/kmemtrace.h>
+#include <trace/power.h>
+
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -16,9 +21,9 @@ enum trace_type {
TRACE_FN,
TRACE_CTX,
TRACE_WAKE,
- TRACE_CONT,
TRACE_STACK,
TRACE_PRINT,
+ TRACE_BPRINT,
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
@@ -29,24 +34,12 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
+ TRACE_KMEM_ALLOC,
+ TRACE_KMEM_FREE,
TRACE_POWER,
+ TRACE_BLK,
- __TRACE_LAST_TYPE
-};
-
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
- unsigned char type;
- unsigned char cpu;
- unsigned char flags;
- unsigned char preempt_count;
- int pid;
- int tgid;
+ __TRACE_LAST_TYPE,
};
/*
@@ -60,13 +53,13 @@ struct ftrace_entry {
/* Function call entry */
struct ftrace_graph_ent_entry {
- struct trace_entry ent;
+ struct trace_entry ent;
struct ftrace_graph_ent graph_ent;
};
/* Function return entry */
struct ftrace_graph_ret_entry {
- struct trace_entry ent;
+ struct trace_entry ent;
struct ftrace_graph_ret ret;
};
extern struct tracer boot_tracer;
@@ -112,12 +105,18 @@ struct userstack_entry {
};
/*
- * ftrace_printk entry:
+ * trace_printk entry:
*/
+struct bprint_entry {
+ struct trace_entry ent;
+ unsigned long ip;
+ const char *fmt;
+ u32 buf[];
+};
+
struct print_entry {
struct trace_entry ent;
unsigned long ip;
- int depth;
char buf[];
};
@@ -170,15 +169,51 @@ struct trace_power {
struct power_trace state_data;
};
+enum kmemtrace_type_id {
+ KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
+ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
+ KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
+};
+
+struct kmemtrace_alloc_entry {
+ struct trace_entry ent;
+ enum kmemtrace_type_id type_id;
+ unsigned long call_site;
+ const void *ptr;
+ size_t bytes_req;
+ size_t bytes_alloc;
+ gfp_t gfp_flags;
+ int node;
+};
+
+struct kmemtrace_free_entry {
+ struct trace_entry ent;
+ enum kmemtrace_type_id type_id;
+ unsigned long call_site;
+ const void *ptr;
+};
+
+struct syscall_trace_enter {
+ struct trace_entry ent;
+ int nr;
+ unsigned long args[];
+};
+
+struct syscall_trace_exit {
+ struct trace_entry ent;
+ int nr;
+ unsigned long ret;
+};
+
+
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
* IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
+ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
* NEED_RESCED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
- * CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +221,6 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
@@ -198,10 +232,8 @@ enum trace_flag_type {
*/
struct trace_array_cpu {
atomic_t disabled;
+ void *buffer_page; /* ring buffer spare */
- /* these fields get copied into max-trace: */
- unsigned long trace_idx;
- unsigned long overrun;
unsigned long saved_latency;
unsigned long critical_start;
unsigned long critical_end;
@@ -209,14 +241,13 @@ struct trace_array_cpu {
unsigned long nice;
unsigned long policy;
unsigned long rt_priority;
+ unsigned long skipped_entries;
cycle_t preempt_timestamp;
pid_t pid;
uid_t uid;
char comm[TASK_COMM_LEN];
};
-struct trace_iterator;
-
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -262,10 +293,10 @@ extern void __ftrace_bad_type(void);
do { \
IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
- IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
+ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@@ -279,26 +310,22 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
+ TRACE_KMEM_ALLOC); \
+ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
+ TRACE_KMEM_FREE); \
__ftrace_bad_type(); \
} while (0)
-/* Return values for print_line callback */
-enum print_line_t {
- TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
- TRACE_TYPE_HANDLED = 1,
- TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
-};
-
-
/*
* An option specific to a tracer. This is a boolean value.
* The bit is the bit index that sets its value on the
* flags value in struct tracer_flags.
*/
struct tracer_opt {
- const char *name; /* Will appear on the trace_options file */
- u32 bit; /* Mask assigned in val field in tracer_flags */
+ const char *name; /* Will appear on the trace_options file */
+ u32 bit; /* Mask assigned in val field in tracer_flags */
};
/*
@@ -307,28 +334,51 @@ struct tracer_opt {
*/
struct tracer_flags {
u32 val;
- struct tracer_opt *opts;
+ struct tracer_opt *opts;
};
/* Makes more easy to define a tracer opt */
#define TRACER_OPT(s, b) .name = #s, .bit = b
-/*
- * A specific tracer, represented by methods that operate on a trace array:
+
+/**
+ * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * @name: the name chosen to select it on the available_tracers file
+ * @init: called when one switches to this tracer (echo name > current_tracer)
+ * @reset: called when one switches to another tracer
+ * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
+ * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @open: called when the trace file is opened
+ * @pipe_open: called when the trace_pipe file is opened
+ * @wait_pipe: override how the user waits for traces on trace_pipe
+ * @close: called when the trace file is released
+ * @read: override the default read callback on trace_pipe
+ * @splice_read: override the default splice_read callback on trace_pipe
+ * @selftest: selftest to run on boot (see trace_selftest.c)
+ * @print_headers: override the first lines that describe your columns
+ * @print_line: callback that prints a trace
+ * @set_flag: signals one of your private flags changed (trace_options file)
+ * @flags: your private flags
*/
struct tracer {
const char *name;
- /* Your tracer should raise a warning if init fails */
int (*init)(struct trace_array *tr);
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
+ void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos);
+ ssize_t (*splice_read)(struct trace_iterator *iter,
+ struct file *filp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags);
#ifdef CONFIG_FTRACE_STARTUP_TEST
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
@@ -339,51 +389,50 @@ struct tracer {
int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
int print_max;
- struct tracer_flags *flags;
+ struct tracer_flags *flags;
+ struct tracer_stat *stats;
};
-struct trace_seq {
- unsigned char buffer[PAGE_SIZE];
- unsigned int len;
- unsigned int readpos;
-};
-
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
- struct trace_array *tr;
- struct tracer *trace;
- void *private;
- struct ring_buffer_iter *buffer_iter[NR_CPUS];
-
- /* The below is zeroed out in pipe_read */
- struct trace_seq seq;
- struct trace_entry *ent;
- int cpu;
- u64 ts;
- unsigned long iter_flags;
- loff_t pos;
- long idx;
-
- cpumask_var_t started;
-};
+#define TRACE_PIPE_ALL_CPU -1
+int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void trace_wake_up(void);
void tracing_reset(struct trace_array *tr, int cpu);
void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+ mode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops);
+
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+struct ring_buffer_event;
+
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags,
+ int pc);
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
-void tracing_generic_entry_update(struct trace_entry *entry,
- unsigned long flags,
- int pc);
+
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+ int *ent_cpu, u64 *ent_ts);
+
+void default_wait_pipe(struct trace_iterator *iter);
+void poll_wait_pipe(struct trace_iterator *iter);
void ftrace(struct trace_array *tr,
struct trace_array_cpu *data,
@@ -391,14 +440,11 @@ void ftrace(struct trace_array *tr,
unsigned long parent_ip,
unsigned long flags, int pc);
void tracing_sched_switch_trace(struct trace_array *tr,
- struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
- struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *cur,
unsigned long flags, int pc);
@@ -408,14 +454,13 @@ void trace_special(struct trace_array *tr,
unsigned long arg2,
unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
- struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
+void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -427,22 +472,42 @@ void unregister_tracer(struct tracer *type);
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+#ifdef CONFIG_TRACER_MAX_TRACE
extern unsigned long tracing_max_latency;
extern unsigned long tracing_thresh;
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
+#endif /* CONFIG_TRACER_MAX_TRACE */
-extern cycle_t ftrace_now(int cpu);
+#ifdef CONFIG_STACKTRACE
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc);
-#ifdef CONFIG_FUNCTION_TRACER
-void tracing_start_function_trace(void);
-void tracing_stop_function_trace(void);
+void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
+ int pc);
+
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc);
#else
-# define tracing_start_function_trace() do { } while (0)
-# define tracing_stop_function_trace() do { } while (0)
-#endif
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ unsigned long flags, int skip, int pc)
+{
+}
+
+static inline void ftrace_trace_userstack(struct trace_array *tr,
+ unsigned long flags, int pc)
+{
+}
+
+static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
+ int skip, int pc)
+{
+}
+#endif /* CONFIG_STACKTRACE */
+
+extern cycle_t ftrace_now(int cpu);
#ifdef CONFIG_CONTEXT_SWITCH_TRACER
typedef void
@@ -456,19 +521,25 @@ struct tracer_switch_ops {
void *private;
struct tracer_switch_ops *next;
};
-
-char *trace_find_cmdline(int pid);
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+extern void trace_find_cmdline(int pid, char comm[]);
+
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
#endif
+extern int ring_buffer_expanded;
+extern bool tracing_selftest_disabled;
+DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_function_graph(struct tracer *trace,
+ struct trace_array *tr);
extern int trace_selftest_startup_irqsoff(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@@ -485,27 +556,31 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+ struct trace_array *tr);
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern void trace_seq_print_cont(struct trace_seq *s,
- struct trace_iterator *iter);
-
+extern unsigned long long ns2usecs(cycle_t nsec);
+extern int
+trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
- unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
- size_t cnt);
-extern long ns2usecs(cycle_t nsec);
+trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern int
-trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...);
extern unsigned long trace_flags;
+extern int trace_clock_id;
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
@@ -537,7 +612,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
return 1;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
-
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function(struct trace_iterator *iter)
@@ -548,6 +622,7 @@ print_graph_function(struct trace_iterator *iter)
extern struct pid *ftrace_pid_trace;
+#ifdef CONFIG_FUNCTION_TRACER
static inline int ftrace_trace_task(struct task_struct *task)
{
if (!ftrace_pid_trace)
@@ -555,6 +630,12 @@ static inline int ftrace_trace_task(struct task_struct *task)
return test_tsk_trace_trace(task);
}
+#else
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+ return 1;
+}
+#endif
/*
* trace_iterator_flags is an enumeration that defines bit
@@ -580,7 +661,11 @@ enum trace_iterator_flags {
TRACE_ITER_ANNOTATE = 0x2000,
TRACE_ITER_USERSTACKTRACE = 0x4000,
TRACE_ITER_SYM_USEROBJ = 0x8000,
- TRACE_ITER_PRINTK_MSGONLY = 0x10000
+ TRACE_ITER_PRINTK_MSGONLY = 0x10000,
+ TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
+ TRACE_ITER_LATENCY_FMT = 0x40000,
+ TRACE_ITER_SLEEP_TIME = 0x80000,
+ TRACE_ITER_GRAPH_TIME = 0x100000,
};
/*
@@ -601,12 +686,12 @@ extern struct tracer nop_trace;
* preempt_enable (after a disable), a schedule might take place
* causing an infinite recursion.
*
- * To prevent this, we read the need_recshed flag before
+ * To prevent this, we read the need_resched flag before
* disabling preemption. When we want to enable preemption we
* check the flag, if it is set, then we call preempt_enable_no_resched.
* Otherwise, we call preempt_enable.
*
- * The rational for doing the above is that if need resched is set
+ * The rational for doing the above is that if need_resched is set
* and we have yet to reschedule, we are either in an atomic location
* (where we do not need to check for scheduling) or we are inside
* the scheduler and do not want to resched.
@@ -627,7 +712,7 @@ static inline int ftrace_preempt_disable(void)
*
* This is a scheduler safe way to enable preemption and not miss
* any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we were either inside an atomic or
+ * If resched is set, then we are either inside an atomic or
* are inside the scheduler (we would have already scheduled
* otherwise). In this case, we do not want to call normal
* preempt_enable, but preempt_enable_no_resched instead.
@@ -664,4 +749,132 @@ static inline void trace_branch_disable(void)
}
#endif /* CONFIG_BRANCH_TRACER */
+/* set ring buffers to default size if not already done so */
+int tracing_update_buffers(void);
+
+/* trace event type bit fields, not numeric */
+enum {
+ TRACE_EVENT_TYPE_PRINTF = 1,
+ TRACE_EVENT_TYPE_RAW = 2,
+};
+
+struct ftrace_event_field {
+ struct list_head link;
+ char *name;
+ char *type;
+ int filter_type;
+ int offset;
+ int size;
+ int is_signed;
+};
+
+struct event_filter {
+ int n_preds;
+ struct filter_pred **preds;
+ char *filter_string;
+ bool no_reset;
+};
+
+struct event_subsystem {
+ struct list_head list;
+ const char *name;
+ struct dentry *entry;
+ struct event_filter *filter;
+ int nr_events;
+};
+
+struct filter_pred;
+
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+ int val1, int val2);
+
+struct filter_pred {
+ filter_pred_fn_t fn;
+ u64 val;
+ char str_val[MAX_FILTER_STR_VAL];
+ int str_len;
+ char *field_name;
+ int offset;
+ int not;
+ int op;
+ int pop_n;
+};
+
+extern void print_event_filter(struct ftrace_event_call *call,
+ struct trace_seq *s);
+extern int apply_event_filter(struct ftrace_event_call *call,
+ char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+ char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s);
+extern int filter_assign_type(const char *type);
+
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+
+#define DEFINE_COMPARISON_PRED(type) \
+static int filter_pred_##type(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = 0; \
+ \
+ switch (pred->op) { \
+ case OP_LT: \
+ match = (*addr < val); \
+ break; \
+ case OP_LE: \
+ match = (*addr <= val); \
+ break; \
+ case OP_GT: \
+ match = (*addr > val); \
+ break; \
+ case OP_GE: \
+ match = (*addr >= val); \
+ break; \
+ default: \
+ break; \
+ } \
+ \
+ return match; \
+}
+
+#define DEFINE_EQUALITY_PRED(size) \
+static int filter_pred_##size(struct filter_pred *pred, void *event, \
+ int val1, int val2) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ u##size val = (u##size)pred->val; \
+ int match; \
+ \
+ match = (val == *addr) ^ pred->not; \
+ \
+ return match; \
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
+
+extern const char *__start___trace_bprintk_fmt[];
+extern const char *__stop___trace_bprintk_fmt[];
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+ extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
+#include "trace_event_types.h"
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 366c8c333e1..19bfc75d467 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,8 +9,10 @@
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
+#include <linux/time.h>
#include "trace.h"
+#include "trace_output.h"
static struct trace_array *boot_trace;
static bool pre_initcalls_finished;
@@ -27,23 +29,24 @@ void start_boot_trace(void)
void enable_boot_trace(void)
{
- if (pre_initcalls_finished)
+ if (boot_trace && pre_initcalls_finished)
tracing_start_sched_switch_record();
}
void disable_boot_trace(void)
{
- if (pre_initcalls_finished)
+ if (boot_trace && pre_initcalls_finished)
tracing_stop_sched_switch_record();
}
static int boot_trace_init(struct trace_array *tr)
{
- int cpu;
boot_trace = tr;
- for_each_cpu(cpu, cpu_possible_mask)
- tracing_reset(tr, cpu);
+ if (!tr)
+ return 0;
+
+ tracing_reset_online_cpus(tr);
tracing_sched_switch_assign_trace(tr);
return 0;
@@ -63,7 +66,7 @@ initcall_call_print_line(struct trace_iterator *iter)
trace_assign_type(field, entry);
call = &field->boot_call;
ts = iter->ts;
- nsec_rem = do_div(ts, 1000000000);
+ nsec_rem = do_div(ts, NSEC_PER_SEC);
ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
(unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -88,7 +91,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
trace_assign_type(field, entry);
init_ret = &field->boot_ret;
ts = iter->ts;
- nsec_rem = do_div(ts, 1000000000);
+ nsec_rem = do_div(ts, NSEC_PER_SEC);
ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
"returned %d after %llu msecs\n",
@@ -127,11 +130,11 @@ struct tracer boot_tracer __read_mostly =
void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
{
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct trace_boot_call *entry;
- unsigned long irq_flags;
struct trace_array *tr = boot_trace;
- if (!pre_initcalls_finished)
+ if (!tr || !pre_initcalls_finished)
return;
/* Get its name now since this function could
@@ -140,18 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
+ sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
- entry->ent.type = TRACE_BOOT_CALL;
entry->boot_call = *bt;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
-
+ trace_buffer_unlock_commit(buffer, event, 0, 0);
out:
preempt_enable();
}
@@ -159,28 +158,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
{
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct trace_boot_ret *entry;
- unsigned long irq_flags;
struct trace_array *tr = boot_trace;
- if (!pre_initcalls_finished)
+ if (!tr || !pre_initcalls_finished)
return;
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ buffer = tr->buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
+ sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
- entry->ent.type = TRACE_BOOT_RET;
entry->boot_ret = *bt;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
-
+ trace_buffer_unlock_commit(buffer, event, 0, 0);
out:
preempt_enable();
}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac..7a7a9fd249a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,21 +14,27 @@
#include <linux/hash.h>
#include <linux/fs.h>
#include <asm/local.h>
+
#include "trace.h"
+#include "trace_stat.h"
+#include "trace_output.h"
#ifdef CONFIG_BRANCH_TRACER
+static struct tracer branch_trace;
static int branch_tracing_enabled __read_mostly;
static DEFINE_MUTEX(branch_tracing_mutex);
+
static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
+ struct ftrace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct ring_buffer_event *event;
struct trace_branch *entry;
- unsigned long flags, irq_flags;
+ unsigned long flags;
int cpu, pc;
const char *p;
@@ -47,15 +53,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
goto out;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
+ pc = preempt_count();
+ event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
+ sizeof(*entry), flags, pc);
if (!event)
goto out;
- pc = preempt_count();
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, flags, pc);
- entry->ent.type = TRACE_BRANCH;
/* Strip off the path, only save the file */
p = f->file + strlen(f->file);
@@ -70,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ ring_buffer_unlock_commit(tr->buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
@@ -88,8 +93,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
int enable_branch_tracing(struct trace_array *tr)
{
- int ret = 0;
-
mutex_lock(&branch_tracing_mutex);
branch_tracer = tr;
/*
@@ -100,7 +103,7 @@ int enable_branch_tracing(struct trace_array *tr)
branch_tracing_enabled++;
mutex_unlock(&branch_tracing_mutex);
- return ret;
+ return 0;
}
void disable_branch_tracing(void)
@@ -128,11 +131,6 @@ static void stop_branch_trace(struct trace_array *tr)
static int branch_trace_init(struct trace_array *tr)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
-
start_branch_trace(tr);
return 0;
}
@@ -142,22 +140,61 @@ static void branch_trace_reset(struct trace_array *tr)
stop_branch_trace(tr);
}
-struct tracer branch_trace __read_mostly =
+static enum print_line_t trace_branch_print(struct trace_iterator *iter,
+ int flags)
+{
+ struct trace_branch *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static void branch_print_header(struct seq_file *s)
+{
+ seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
+ " FUNC:FILE:LINE\n");
+ seq_puts(s, "# | | | | | "
+ " |\n");
+}
+
+static struct trace_event trace_branch_event = {
+ .type = TRACE_BRANCH,
+ .trace = trace_branch_print,
+};
+
+static struct tracer branch_trace __read_mostly =
{
.name = "branch",
.init = branch_trace_init,
.reset = branch_trace_reset,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_branch,
-#endif
+#endif /* CONFIG_FTRACE_SELFTEST */
+ .print_header = branch_print_header,
};
-__init static int init_branch_trace(void)
+__init static int init_branch_tracer(void)
{
+ int ret;
+
+ ret = register_ftrace_event(&trace_branch_event);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "branch events\n");
+ return 1;
+ }
return register_tracer(&branch_trace);
}
+device_initcall(init_branch_tracer);
-device_initcall(init_branch_trace);
#else
static inline
void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,66 +220,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
}
EXPORT_SYMBOL(ftrace_likely_update);
-struct ftrace_pointer {
- void *start;
- void *stop;
- int hit;
-};
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
-static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+static int annotated_branch_stat_headers(struct seq_file *m)
{
- const struct ftrace_pointer *f = m->private;
- struct ftrace_branch_data *p = v;
-
- (*pos)++;
-
- if (v == (void *)1)
- return f->start;
-
- ++p;
-
- if ((void *)p >= (void *)f->stop)
- return NULL;
-
- return p;
+ seq_printf(m, " correct incorrect %% ");
+ seq_printf(m, " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static inline long get_incorrect_percent(struct ftrace_branch_data *p)
{
- void *t = (void *)1;
- loff_t l = 0;
-
- for (; t && l < *pos; t = t_next(m, t, &l))
- ;
+ long percent;
- return t;
-}
+ if (p->correct) {
+ percent = p->incorrect * 100;
+ percent /= p->correct + p->incorrect;
+ } else
+ percent = p->incorrect ? 100 : -1;
-static void t_stop(struct seq_file *m, void *p)
-{
+ return percent;
}
-static int t_show(struct seq_file *m, void *v)
+static int branch_stat_show(struct seq_file *m, void *v)
{
- const struct ftrace_pointer *fp = m->private;
struct ftrace_branch_data *p = v;
const char *f;
long percent;
- if (v == (void *)1) {
- if (fp->hit)
- seq_printf(m, " miss hit %% ");
- else
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
- return 0;
- }
-
/* Only print the file, not the path */
f = p->file + strlen(p->file);
while (f >= p->file && *f != '/')
@@ -252,11 +262,7 @@ static int t_show(struct seq_file *m, void *v)
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
- if (p->correct) {
- percent = p->incorrect * 100;
- percent /= p->correct + p->incorrect;
- } else
- percent = p->incorrect ? 100 : -1;
+ percent = get_incorrect_percent(p);
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
@@ -267,76 +273,118 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
-static struct seq_operations tracing_likely_seq_ops = {
- .start = t_start,
- .next = t_next,
- .stop = t_stop,
- .show = t_show,
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
+{
+ return __start_annotated_branch_profile;
+}
+
+static void *
+annotated_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
+
+ ++p;
+
+ if ((void *)p >= (void *)__stop_annotated_branch_profile)
+ return NULL;
+
+ return p;
+}
+
+static int annotated_branch_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_branch_data *a = p1;
+ struct ftrace_branch_data *b = p2;
+
+ long percent_a, percent_b;
+
+ percent_a = get_incorrect_percent(a);
+ percent_b = get_incorrect_percent(b);
+
+ if (percent_a < percent_b)
+ return -1;
+ if (percent_a > percent_b)
+ return 1;
+ else
+ return 0;
+}
+
+static struct tracer_stat annotated_branch_stats = {
+ .name = "branch_annotated",
+ .stat_start = annotated_branch_stat_start,
+ .stat_next = annotated_branch_stat_next,
+ .stat_cmp = annotated_branch_stat_cmp,
+ .stat_headers = annotated_branch_stat_headers,
+ .stat_show = branch_stat_show
};
-static int tracing_branch_open(struct inode *inode, struct file *file)
+__init static int init_annotated_branch_stats(void)
{
int ret;
- ret = seq_open(file, &tracing_likely_seq_ops);
+ ret = register_stat_tracer(&annotated_branch_stats);
if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = (void *)inode->i_private;
+ printk(KERN_WARNING "Warning: could not register "
+ "annotated branches stats\n");
+ return 1;
}
-
- return ret;
+ return 0;
}
-
-static const struct file_operations tracing_branch_fops = {
- .open = tracing_branch_open,
- .read = seq_read,
- .llseek = seq_lseek,
-};
+fs_initcall(init_annotated_branch_stats);
#ifdef CONFIG_PROFILE_ALL_BRANCHES
+
extern unsigned long __start_branch_profile[];
extern unsigned long __stop_branch_profile[];
-static const struct ftrace_pointer ftrace_branch_pos = {
- .start = __start_branch_profile,
- .stop = __stop_branch_profile,
- .hit = 1,
-};
+static int all_branch_stat_headers(struct seq_file *m)
+{
+ seq_printf(m, " miss hit %% ");
+ seq_printf(m, " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+}
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+static void *all_branch_stat_start(struct tracer_stat *trace)
+{
+ return __start_branch_profile;
+}
-extern unsigned long __start_annotated_branch_profile[];
-extern unsigned long __stop_annotated_branch_profile[];
+static void *
+all_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
-static const struct ftrace_pointer ftrace_annotated_branch_pos = {
- .start = __start_annotated_branch_profile,
- .stop = __stop_annotated_branch_profile,
-};
+ ++p;
-static __init int ftrace_branch_init(void)
-{
- struct dentry *d_tracer;
- struct dentry *entry;
+ if ((void *)p >= (void *)__stop_branch_profile)
+ return NULL;
- d_tracer = tracing_init_dentry();
+ return p;
+}
- entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
- (void *)&ftrace_annotated_branch_pos,
- &tracing_branch_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'profile_annotatet_branch' entry\n");
+static struct tracer_stat all_branch_stats = {
+ .name = "branch_all",
+ .stat_start = all_branch_stat_start,
+ .stat_next = all_branch_stat_next,
+ .stat_headers = all_branch_stat_headers,
+ .stat_show = branch_stat_show
+};
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
- entry = debugfs_create_file("profile_branch", 0444, d_tracer,
- (void *)&ftrace_branch_pos,
- &tracing_branch_fops);
- if (!entry)
- pr_warning("Could not create debugfs"
- " 'profile_branch' entry\n");
-#endif
+__init static int all_annotated_branch_stats(void)
+{
+ int ret;
+ ret = register_stat_tracer(&all_branch_stats);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "all branches stats\n");
+ return 1;
+ }
return 0;
}
-
-device_initcall(ftrace_branch_init);
+fs_initcall(all_annotated_branch_stats);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 00000000000..b588fd81f7f
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,109 @@
+/*
+ * tracing clocks
+ *
+ * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Implements 3 trace clock variants, with differing scalability/precision
+ * tradeoffs:
+ *
+ * - local: CPU-local trace clock
+ * - medium: scalable global clock with some jitter
+ * - global: globally monotonic, serialized clock
+ *
+ * Tracer plugins will chose a default from these clocks.
+ */
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/ktime.h>
+#include <linux/trace_clock.h>
+
+/*
+ * trace_clock_local(): the simplest and least coherent tracing clock.
+ *
+ * Useful for tracing that does not cross to other CPUs nor
+ * does it go through idle events.
+ */
+u64 notrace trace_clock_local(void)
+{
+ unsigned long flags;
+ u64 clock;
+
+ /*
+ * sched_clock() is an architecture implemented, fast, scalable,
+ * lockless clock. It is not guaranteed to be coherent across
+ * CPUs, nor across CPU idle events.
+ */
+ raw_local_irq_save(flags);
+ clock = sched_clock();
+ raw_local_irq_restore(flags);
+
+ return clock;
+}
+
+/*
+ * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * but not completely incorrect when crossing CPUs either.
+ *
+ * This is based on cpu_clock(), which will allow at most ~1 jiffy of
+ * jitter between CPUs. So it's a pretty scalable clock, but there
+ * can be offsets in the trace data.
+ */
+u64 notrace trace_clock(void)
+{
+ return cpu_clock(raw_smp_processor_id());
+}
+
+
+/*
+ * trace_clock_global(): special globally coherent trace clock
+ *
+ * It has higher overhead than the other trace clocks but is still
+ * an order of magnitude faster than GTOD derived hardware clocks.
+ *
+ * Used by plugins that need globally coherent timestamps.
+ */
+
+static u64 prev_trace_clock_time;
+
+static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
+ (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+u64 notrace trace_clock_global(void)
+{
+ unsigned long flags;
+ int this_cpu;
+ u64 now;
+
+ raw_local_irq_save(flags);
+
+ this_cpu = raw_smp_processor_id();
+ now = cpu_clock(this_cpu);
+ /*
+ * If in an NMI context then dont risk lockups and return the
+ * cpu_clock() time:
+ */
+ if (unlikely(in_nmi()))
+ goto out;
+
+ __raw_spin_lock(&trace_clock_lock);
+
+ /*
+ * TODO: if this happens often then maybe we should reset
+ * my_scd->clock to prev_trace_clock_time+1, to make sure
+ * we start ticking with the local clock from now on?
+ */
+ if ((s64)(now - prev_trace_clock_time) < 0)
+ now = prev_trace_clock_time + 1;
+
+ prev_trace_clock_time = now;
+
+ __raw_spin_unlock(&trace_clock_lock);
+
+ out:
+ raw_local_irq_restore(flags);
+
+ return now;
+}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
new file mode 100644
index 00000000000..11ba5bb4ed0
--- /dev/null
+++ b/kernel/trace/trace_event_profile.c
@@ -0,0 +1,39 @@
+/*
+ * trace event based perf counter profiling
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ */
+
+#include "trace.h"
+
+int ftrace_profile_enable(int event_id)
+{
+ struct ftrace_event_call *event;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(event, &ftrace_events, list) {
+ if (event->id == event_id && event->profile_enable) {
+ ret = event->profile_enable(event);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+void ftrace_profile_disable(int event_id)
+{
+ struct ftrace_event_call *event;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(event, &ftrace_events, list) {
+ if (event->id == event_id) {
+ event->profile_disable(event);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+}
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 00000000000..6db005e1248
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,178 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ftrace
+
+/*
+ * We cheat and use the proto type field as the ID
+ * and args as the entry type (minus 'struct')
+ */
+TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, ip, ip)
+ TRACE_FIELD(unsigned long, parent_ip, parent_ip)
+ ),
+ TP_RAW_FMT(" %lx <-- %lx")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
+ ftrace_graph_ent_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, graph_ent.func, func)
+ TRACE_FIELD(int, graph_ent.depth, depth)
+ ),
+ TP_RAW_FMT("--> %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
+ ftrace_graph_ret_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, ret.func, func)
+ TRACE_FIELD(unsigned long long, ret.calltime, calltime)
+ TRACE_FIELD(unsigned long long, ret.rettime, rettime)
+ TRACE_FIELD(unsigned long, ret.overrun, overrun)
+ TRACE_FIELD(int, ret.depth, depth)
+ ),
+ TP_RAW_FMT("<-- %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+ TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+ TRACE_FIELD(unsigned char, prev_state, prev_state)
+ TRACE_FIELD(unsigned int, next_pid, next_pid)
+ TRACE_FIELD(unsigned char, next_prio, next_prio)
+ TRACE_FIELD(unsigned char, next_state, next_state)
+ TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+ ),
+ TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+ TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+ TRACE_FIELD(unsigned char, prev_state, prev_state)
+ TRACE_FIELD(unsigned int, next_pid, next_pid)
+ TRACE_FIELD(unsigned char, next_prio, next_prio)
+ TRACE_FIELD(unsigned char, next_state, next_state)
+ TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+ ),
+ TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, arg1, arg1)
+ TRACE_FIELD(unsigned long, arg2, arg2)
+ TRACE_FIELD(unsigned long, arg3, arg3)
+ ),
+ TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+/* #define FTRACE_STACK_ENTRIES 8 */
+
+TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, caller[0], stack0)
+ TRACE_FIELD(unsigned long, caller[1], stack1)
+ TRACE_FIELD(unsigned long, caller[2], stack2)
+ TRACE_FIELD(unsigned long, caller[3], stack3)
+ TRACE_FIELD(unsigned long, caller[4], stack4)
+ TRACE_FIELD(unsigned long, caller[5], stack5)
+ TRACE_FIELD(unsigned long, caller[6], stack6)
+ TRACE_FIELD(unsigned long, caller[7], stack7)
+ ),
+ TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, caller[0], stack0)
+ TRACE_FIELD(unsigned long, caller[1], stack1)
+ TRACE_FIELD(unsigned long, caller[2], stack2)
+ TRACE_FIELD(unsigned long, caller[3], stack3)
+ TRACE_FIELD(unsigned long, caller[4], stack4)
+ TRACE_FIELD(unsigned long, caller[5], stack5)
+ TRACE_FIELD(unsigned long, caller[6], stack6)
+ TRACE_FIELD(unsigned long, caller[7], stack7)
+ ),
+ TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, ip, ip)
+ TRACE_FIELD(char *, fmt, fmt)
+ TRACE_FIELD_ZERO_CHAR(buf)
+ ),
+ TP_RAW_FMT("%08lx (%d) fmt:%p %s")
+);
+
+TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned long, ip, ip)
+ TRACE_FIELD_ZERO_CHAR(buf)
+ ),
+ TP_RAW_FMT("%08lx (%d) fmt:%p %s")
+);
+
+TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(unsigned int, line, line)
+ TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+ TRACE_FUNC_SIZE+1, func)
+ TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+ TRACE_FUNC_SIZE+1, file)
+ TRACE_FIELD(char, correct, correct)
+ ),
+ TP_RAW_FMT("%u:%s:%s (%u)")
+);
+
+TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(u64, from, from)
+ TRACE_FIELD(u64, to, to)
+ ),
+ TP_RAW_FMT("from: %llx to: %llx")
+);
+
+TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+ TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
+ TRACE_FIELD(int, state_data.type, type)
+ TRACE_FIELD(int, state_data.state, state)
+ ),
+ TP_RAW_FMT("%llx->%llx type:%u state:%u")
+);
+
+TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+ TRACE_FIELD(unsigned long, call_site, call_site)
+ TRACE_FIELD(const void *, ptr, ptr)
+ TRACE_FIELD(size_t, bytes_req, bytes_req)
+ TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
+ TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
+ TRACE_FIELD(int, node, node)
+ ),
+ TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+ " flags:%x node:%d")
+);
+
+TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
+ TRACE_STRUCT(
+ TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+ TRACE_FIELD(unsigned long, call_site, call_site)
+ TRACE_FIELD(const void *, ptr, ptr)
+ ),
+ TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
+);
+
+#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 00000000000..78b1ed23017
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,1505 @@
+/*
+ * event tracer
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * - Added format output of fields of the trace point.
+ * This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
+ *
+ */
+
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+
+#include <asm/setup.h>
+
+#include "trace_output.h"
+
+#define TRACE_SYSTEM "TRACE_SYSTEM"
+
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
+
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type)
+{
+ struct ftrace_event_field *field;
+
+ field = kzalloc(sizeof(*field), GFP_KERNEL);
+ if (!field)
+ goto err;
+
+ field->name = kstrdup(name, GFP_KERNEL);
+ if (!field->name)
+ goto err;
+
+ field->type = kstrdup(type, GFP_KERNEL);
+ if (!field->type)
+ goto err;
+
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+ else
+ field->filter_type = filter_type;
+
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+
+ list_add(&field->link, &call->fields);
+
+ return 0;
+
+err:
+ if (field) {
+ kfree(field->name);
+ kfree(field->type);
+ }
+ kfree(field);
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(trace_define_field);
+
+#define __common_field(type, item) \
+ ret = trace_define_field(call, #type, "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+int trace_define_common_fields(struct ftrace_event_call *call)
+{
+ int ret;
+ struct trace_entry ent;
+
+ __common_field(unsigned short, type);
+ __common_field(unsigned char, flags);
+ __common_field(unsigned char, preempt_count);
+ __common_field(int, pid);
+ __common_field(int, tgid);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_define_common_fields);
+
+#ifdef CONFIG_MODULES
+
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+ struct ftrace_event_field *field, *next;
+
+ list_for_each_entry_safe(field, next, &call->fields, link) {
+ list_del(&field->link);
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
+ }
+}
+
+#endif /* CONFIG_MODULES */
+
+static void ftrace_event_enable_disable(struct ftrace_event_call *call,
+ int enable)
+{
+ switch (enable) {
+ case 0:
+ if (call->enabled) {
+ call->enabled = 0;
+ tracing_stop_cmdline_record();
+ call->unregfunc(call->data);
+ }
+ break;
+ case 1:
+ if (!call->enabled) {
+ call->enabled = 1;
+ tracing_start_cmdline_record();
+ call->regfunc(call->data);
+ }
+ break;
+ }
+}
+
+static void ftrace_clear_events(void)
+{
+ struct ftrace_event_call *call;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ ftrace_event_enable_disable(call, 0);
+ }
+ mutex_unlock(&event_mutex);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+ const char *event, int set)
+{
+ struct ftrace_event_call *call;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ if (!call->name || !call->regfunc)
+ continue;
+
+ if (match &&
+ strcmp(match, call->name) != 0 &&
+ strcmp(match, call->system) != 0)
+ continue;
+
+ if (sub && strcmp(sub, call->system) != 0)
+ continue;
+
+ if (event && strcmp(event, call->name) != 0)
+ continue;
+
+ ftrace_event_enable_disable(call, set);
+
+ ret = 0;
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
+ char *event = NULL, *sub = NULL, *match;
+
+ /*
+ * The buf format can be <subsystem>:<event-name>
+ * *:<event-name> means any event by that name.
+ * :<event-name> is the same.
+ *
+ * <subsystem>:* means all events in that subsystem
+ * <subsystem>: means the same.
+ *
+ * <name> (no ':') means all events in a subsystem with
+ * the name <name> or any event that matches <name>
+ */
+
+ match = strsep(&buf, ":");
+ if (buf) {
+ sub = match;
+ event = buf;
+ match = NULL;
+
+ if (!strlen(sub) || strcmp(sub, "*") == 0)
+ sub = NULL;
+ if (!strlen(event) || strcmp(event, "*") == 0)
+ event = NULL;
+ }
+
+ return __ftrace_set_clr_event(match, sub, event, set);
+}
+
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+ return __ftrace_set_clr_event(NULL, system, event, set);
+}
+
+/* 128 should be much more than enough */
+#define EVENT_BUF_SIZE 127
+
+static ssize_t
+ftrace_event_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ size_t read = 0;
+ int i, set = 1;
+ ssize_t ret;
+ char *buf;
+ char ch;
+
+ if (!cnt || cnt < 0)
+ return 0;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ return ret;
+ read++;
+ cnt--;
+
+ /* skip white space */
+ while (cnt && isspace(ch)) {
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ return ret;
+ read++;
+ cnt--;
+ }
+
+ /* Only white space found? */
+ if (isspace(ch)) {
+ file->f_pos += read;
+ ret = read;
+ return ret;
+ }
+
+ buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (cnt > EVENT_BUF_SIZE)
+ cnt = EVENT_BUF_SIZE;
+
+ i = 0;
+ while (cnt && !isspace(ch)) {
+ if (!i && ch == '!')
+ set = 0;
+ else
+ buf[i++] = ch;
+
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out_free;
+ read++;
+ cnt--;
+ }
+ buf[i] = 0;
+
+ file->f_pos += read;
+
+ ret = ftrace_set_clr_event(buf, set);
+ if (ret)
+ goto out_free;
+
+ ret = read;
+
+ out_free:
+ kfree(buf);
+
+ return ret;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct list_head *list = m->private;
+ struct ftrace_event_call *call;
+
+ (*pos)++;
+
+ for (;;) {
+ if (list == &ftrace_events)
+ return NULL;
+
+ call = list_entry(list, struct ftrace_event_call, list);
+
+ /*
+ * The ftrace subsystem is for showing formats only.
+ * They can not be enabled or disabled via the event files.
+ */
+ if (call->regfunc)
+ break;
+
+ list = list->next;
+ }
+
+ m->private = list->next;
+
+ return call;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_call *call = NULL;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ m->private = ftrace_events.next;
+ for (l = 0; l <= *pos; ) {
+ call = t_next(m, NULL, &l);
+ if (!call)
+ break;
+ }
+ return call;
+}
+
+static void *
+s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct list_head *list = m->private;
+ struct ftrace_event_call *call;
+
+ (*pos)++;
+
+ retry:
+ if (list == &ftrace_events)
+ return NULL;
+
+ call = list_entry(list, struct ftrace_event_call, list);
+
+ if (!call->enabled) {
+ list = list->next;
+ goto retry;
+ }
+
+ m->private = list->next;
+
+ return call;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_call *call = NULL;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ m->private = ftrace_events.next;
+ for (l = 0; l <= *pos; ) {
+ call = s_next(m, NULL, &l);
+ if (!call)
+ break;
+ }
+ return call;
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_event_call *call = v;
+
+ if (strcmp(call->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->system);
+ seq_printf(m, "%s\n", call->name);
+
+ return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static int
+ftrace_event_seq_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_events();
+
+ seq_ops = inode->i_private;
+ return seq_open(file, seq_ops);
+}
+
+static ssize_t
+event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ char *buf;
+
+ if (call->enabled)
+ buf = "1\n";
+ else
+ buf = "0\n";
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ char buf[64];
+ unsigned long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ case 1:
+ mutex_lock(&event_mutex);
+ ftrace_event_enable_disable(call, val);
+ mutex_unlock(&event_mutex);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ const char *system = filp->private_data;
+ struct ftrace_event_call *call;
+ char buf[2];
+ int set = 0;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!call->name || !call->regfunc)
+ continue;
+
+ if (system && strcmp(call->system, system) != 0)
+ continue;
+
+ /*
+ * We need to find out if all the events are set
+ * or if all events or cleared, or if we have
+ * a mixture.
+ */
+ set |= (1 << !!call->enabled);
+
+ /*
+ * If we have a mixture, no need to look further.
+ */
+ if (set == 3)
+ break;
+ }
+ mutex_unlock(&event_mutex);
+
+ buf[0] = set_to_char[set];
+ buf[1] = '\n';
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+ return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char *system = filp->private_data;
+ unsigned long val;
+ char buf[64];
+ ssize_t ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+ if (ret)
+ goto out;
+
+ ret = cnt;
+
+out:
+ *ppos += cnt;
+
+ return ret;
+}
+
+extern char *__bad_type_size(void);
+
+#undef FIELD
+#define FIELD(type, name) \
+ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
+ #type, "common_" #name, offsetof(typeof(field), name), \
+ sizeof(field.name)
+
+static int trace_write_header(struct trace_seq *s)
+{
+ struct trace_entry field;
+
+ /* struct trace_entry */
+ return trace_seq_printf(s,
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\n",
+ FIELD(unsigned short, type),
+ FIELD(unsigned char, flags),
+ FIELD(unsigned char, preempt_count),
+ FIELD(int, pid),
+ FIELD(int, tgid));
+}
+
+static ssize_t
+event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ struct trace_seq *s;
+ char *buf;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ /* If any of the first writes fail, so will the show_format. */
+
+ trace_seq_printf(s, "name: %s\n", call->name);
+ trace_seq_printf(s, "ID: %d\n", call->id);
+ trace_seq_printf(s, "format:\n");
+ trace_write_header(s);
+
+ r = call->show_format(call, s);
+ if (!r) {
+ /*
+ * ug! The format output is bigger than a PAGE!!
+ */
+ buf = "FORMAT TOO BIG\n";
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ buf, strlen(buf));
+ goto out;
+ }
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, s->len);
+ out:
+ kfree(s);
+ return r;
+}
+
+static ssize_t
+event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+ trace_seq_printf(s, "%d\n", call->id);
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, s->len);
+ kfree(s);
+ return r;
+}
+
+static ssize_t
+event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ print_event_filter(call, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_call *call = filp->private_data;
+ char *buf;
+ int err;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+
+ err = apply_event_filter(call, buf);
+ free_page((unsigned long) buf);
+ if (err < 0)
+ return err;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct event_subsystem *system = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ print_subsystem_event_filter(system, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct event_subsystem *system = filp->private_data;
+ char *buf;
+ int err;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+
+ err = apply_subsystem_event_filter(system, buf);
+ free_page((unsigned long) buf);
+ if (err < 0)
+ return err;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ func(s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+ kfree(s);
+
+ return r;
+}
+
+static const struct seq_operations show_event_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static const struct seq_operations show_set_event_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static const struct file_operations ftrace_avail_fops = {
+ .open = ftrace_event_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_set_event_fops = {
+ .open = ftrace_event_seq_open,
+ .read = seq_read,
+ .write = ftrace_event_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_enable_fops = {
+ .open = tracing_open_generic,
+ .read = event_enable_read,
+ .write = event_enable_write,
+};
+
+static const struct file_operations ftrace_event_format_fops = {
+ .open = tracing_open_generic,
+ .read = event_format_read,
+};
+
+static const struct file_operations ftrace_event_id_fops = {
+ .open = tracing_open_generic,
+ .read = event_id_read,
+};
+
+static const struct file_operations ftrace_event_filter_fops = {
+ .open = tracing_open_generic,
+ .read = event_filter_read,
+ .write = event_filter_write,
+};
+
+static const struct file_operations ftrace_subsystem_filter_fops = {
+ .open = tracing_open_generic,
+ .read = subsystem_filter_read,
+ .write = subsystem_filter_write,
+};
+
+static const struct file_operations ftrace_system_enable_fops = {
+ .open = tracing_open_generic,
+ .read = system_enable_read,
+ .write = system_enable_write,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+ .open = tracing_open_generic,
+ .read = show_header,
+};
+
+static struct dentry *event_trace_events_dir(void)
+{
+ static struct dentry *d_tracer;
+ static struct dentry *d_events;
+
+ if (d_events)
+ return d_events;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return NULL;
+
+ d_events = debugfs_create_dir("events", d_tracer);
+ if (!d_events)
+ pr_warning("Could not create debugfs "
+ "'events' directory\n");
+
+ return d_events;
+}
+
+static LIST_HEAD(event_subsystems);
+
+static struct dentry *
+event_subsystem_dir(const char *name, struct dentry *d_events)
+{
+ struct event_subsystem *system;
+ struct dentry *entry;
+
+ /* First see if we did not already create this dir */
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0) {
+ system->nr_events++;
+ return system->entry;
+ }
+ }
+
+ /* need to create new entry */
+ system = kmalloc(sizeof(*system), GFP_KERNEL);
+ if (!system) {
+ pr_warning("No memory to create event subsystem %s\n",
+ name);
+ return d_events;
+ }
+
+ system->entry = debugfs_create_dir(name, d_events);
+ if (!system->entry) {
+ pr_warning("Could not create event subsystem %s\n",
+ name);
+ kfree(system);
+ return d_events;
+ }
+
+ system->nr_events = 1;
+ system->name = kstrdup(name, GFP_KERNEL);
+ if (!system->name) {
+ debugfs_remove(system->entry);
+ kfree(system);
+ return d_events;
+ }
+
+ list_add(&system->list, &event_subsystems);
+
+ system->filter = NULL;
+
+ system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ if (!system->filter) {
+ pr_warning("Could not allocate filter for subsystem "
+ "'%s'\n", name);
+ return system->entry;
+ }
+
+ entry = debugfs_create_file("filter", 0644, system->entry, system,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warning("Could not create debugfs "
+ "'%s/filter' entry\n", name);
+ }
+
+ entry = trace_create_file("enable", 0644, system->entry,
+ (void *)system->name,
+ &ftrace_system_enable_fops);
+
+ return system->entry;
+}
+
+static int
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+ const struct file_operations *id,
+ const struct file_operations *enable,
+ const struct file_operations *filter,
+ const struct file_operations *format)
+{
+ struct dentry *entry;
+ int ret;
+
+ /*
+ * If the trace point header did not define TRACE_SYSTEM
+ * then the system would be called "TRACE_SYSTEM".
+ */
+ if (strcmp(call->system, TRACE_SYSTEM) != 0)
+ d_events = event_subsystem_dir(call->system, d_events);
+
+ call->dir = debugfs_create_dir(call->name, d_events);
+ if (!call->dir) {
+ pr_warning("Could not create debugfs "
+ "'%s' directory\n", call->name);
+ return -1;
+ }
+
+ if (call->regfunc)
+ entry = trace_create_file("enable", 0644, call->dir, call,
+ enable);
+
+ if (call->id && call->profile_enable)
+ entry = trace_create_file("id", 0444, call->dir, call,
+ id);
+
+ if (call->define_fields) {
+ ret = call->define_fields(call);
+ if (ret < 0) {
+ pr_warning("Could not initialize trace point"
+ " events/%s\n", call->name);
+ return ret;
+ }
+ entry = trace_create_file("filter", 0644, call->dir, call,
+ filter);
+ }
+
+ /* A trace may not want to export its format */
+ if (!call->show_format)
+ return 0;
+
+ entry = trace_create_file("format", 0444, call->dir, call,
+ format);
+
+ return 0;
+}
+
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+ struct list_head list;
+ struct module *mod;
+ struct file_operations id;
+ struct file_operations enable;
+ struct file_operations format;
+ struct file_operations filter;
+};
+
+static void remove_subsystem_dir(const char *name)
+{
+ struct event_subsystem *system;
+
+ if (strcmp(name, TRACE_SYSTEM) == 0)
+ return;
+
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0) {
+ if (!--system->nr_events) {
+ struct event_filter *filter = system->filter;
+
+ debugfs_remove_recursive(system->entry);
+ list_del(&system->list);
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ kfree(system->name);
+ kfree(system);
+ }
+ break;
+ }
+ }
+}
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops;
+
+ /*
+ * This is a bit of a PITA. To allow for correct reference
+ * counting, modules must "own" their file_operations.
+ * To do this, we allocate the file operations that will be
+ * used in the event directory.
+ */
+
+ file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+ if (!file_ops)
+ return NULL;
+
+ file_ops->mod = mod;
+
+ file_ops->id = ftrace_event_id_fops;
+ file_ops->id.owner = mod;
+
+ file_ops->enable = ftrace_enable_fops;
+ file_ops->enable.owner = mod;
+
+ file_ops->filter = ftrace_event_filter_fops;
+ file_ops->filter.owner = mod;
+
+ file_ops->format = ftrace_event_format_fops;
+ file_ops->format.owner = mod;
+
+ list_add(&file_ops->list, &ftrace_module_file_list);
+
+ return file_ops;
+}
+
+static void trace_module_add_events(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops = NULL;
+ struct ftrace_event_call *call, *start, *end;
+ struct dentry *d_events;
+ int ret;
+
+ start = mod->trace_events;
+ end = mod->trace_events + mod->num_trace_events;
+
+ if (start == end)
+ return;
+
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return;
+
+ for_each_event(call, start, end) {
+ /* The linker may leave blanks */
+ if (!call->name)
+ continue;
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
+ /*
+ * This module has events, create file ops for this module
+ * if not already done.
+ */
+ if (!file_ops) {
+ file_ops = trace_create_file_ops(mod);
+ if (!file_ops)
+ return;
+ }
+ call->mod = mod;
+ list_add(&call->list, &ftrace_events);
+ event_create_dir(call, d_events,
+ &file_ops->id, &file_ops->enable,
+ &file_ops->filter, &file_ops->format);
+ }
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+ struct ftrace_module_file_ops *file_ops;
+ struct ftrace_event_call *call, *p;
+ bool found = false;
+
+ down_write(&trace_event_mutex);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ if (call->mod == mod) {
+ found = true;
+ ftrace_event_enable_disable(call, 0);
+ if (call->event)
+ __unregister_ftrace_event(call->event);
+ debugfs_remove_recursive(call->dir);
+ list_del(&call->list);
+ trace_destroy_fields(call);
+ destroy_preds(call);
+ remove_subsystem_dir(call->system);
+ }
+ }
+
+ /* Now free the file_operations */
+ list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+ if (file_ops->mod == mod)
+ break;
+ }
+ if (&file_ops->list != &ftrace_module_file_list) {
+ list_del(&file_ops->list);
+ kfree(file_ops);
+ }
+
+ /*
+ * It is safest to reset the ring buffer if the module being unloaded
+ * registered any events.
+ */
+ if (found)
+ tracing_reset_current_online_cpus();
+ up_write(&trace_event_mutex);
+}
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ mutex_lock(&event_mutex);
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_events(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_events(mod);
+ break;
+ }
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+#else
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+ strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = 1;
+ tracing_selftest_disabled = 1;
+
+ return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
+static __init int event_trace_init(void)
+{
+ struct ftrace_event_call *call;
+ struct dentry *d_tracer;
+ struct dentry *entry;
+ struct dentry *d_events;
+ int ret;
+ char *buf = bootup_event_buf;
+ char *token;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ entry = debugfs_create_file("available_events", 0444, d_tracer,
+ (void *)&show_event_seq_ops,
+ &ftrace_avail_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'available_events' entry\n");
+
+ entry = debugfs_create_file("set_event", 0644, d_tracer,
+ (void *)&show_set_event_seq_ops,
+ &ftrace_set_event_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'set_event' entry\n");
+
+ d_events = event_trace_events_dir();
+ if (!d_events)
+ return 0;
+
+ /* ring buffer internal formats */
+ trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("enable", 0644, d_events,
+ NULL, &ftrace_system_enable_fops);
+
+ for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
+ /* The linker may leave blanks */
+ if (!call->name)
+ continue;
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
+ list_add(&call->list, &ftrace_events);
+ event_create_dir(call, d_events, &ftrace_event_id_fops,
+ &ftrace_enable_fops, &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ }
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ ret = ftrace_set_clr_event(token, 1);
+ if (ret)
+ pr_warning("Failed to enable trace event: %s\n", token);
+ }
+
+ ret = register_module_notifier(&trace_module_nb);
+ if (ret)
+ pr_warning("Failed to register trace events module notifier\n");
+
+ return 0;
+}
+fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+ spin_lock(&test_spinlock);
+ spin_lock_irq(&test_spinlock_irq);
+ udelay(1);
+ spin_unlock_irq(&test_spinlock_irq);
+ spin_unlock(&test_spinlock);
+
+ mutex_lock(&test_mutex);
+ msleep(1);
+ mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+ void *test_malloc;
+
+ test_malloc = kmalloc(1234, GFP_KERNEL);
+ if (!test_malloc)
+ pr_info("failed to kmalloc\n");
+
+ schedule_on_each_cpu(test_work);
+
+ kfree(test_malloc);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop())
+ schedule();
+
+ return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+ struct task_struct *test_thread;
+
+ test_thread = kthread_run(event_test_thread, NULL, "test-events");
+ msleep(1);
+ kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init void event_trace_self_tests(void)
+{
+ struct ftrace_event_call *call;
+ struct event_subsystem *system;
+ int ret;
+
+ pr_info("Running tests on trace events:\n");
+
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ /* Only test those that have a regfunc */
+ if (!call->regfunc)
+ continue;
+
+ pr_info("Testing event %s: ", call->name);
+
+ /*
+ * If an event is already enabled, someone is using
+ * it and the self test should not be on.
+ */
+ if (call->enabled) {
+ pr_warning("Enabled event during self test!\n");
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ ftrace_event_enable_disable(call, 1);
+ event_test_stuff();
+ ftrace_event_enable_disable(call, 0);
+
+ pr_cont("OK\n");
+ }
+
+ /* Now test at the sub system level */
+
+ pr_info("Running tests on trace event systems:\n");
+
+ list_for_each_entry(system, &event_subsystems, list) {
+
+ /* the ftrace system is special, skip it */
+ if (strcmp(system->name, "ftrace") == 0)
+ continue;
+
+ pr_info("Testing event system %s: ", system->name);
+
+ ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error enabling system %s\n",
+ system->name);
+ continue;
+ }
+
+ event_test_stuff();
+
+ ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+ if (WARN_ON_ONCE(ret))
+ pr_warning("error disabling system %s\n",
+ system->name);
+
+ pr_cont("OK\n");
+ }
+
+ /* Test with all events enabled */
+
+ pr_info("Running tests on all trace events:\n");
+ pr_info("Testing all events: ");
+
+ ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error enabling all events\n");
+ return;
+ }
+
+ event_test_stuff();
+
+ /* reset sysname */
+ ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warning("error disabling all events\n");
+ return;
+ }
+
+ pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct ftrace_entry *entry;
+ unsigned long flags;
+ long disabled;
+ int resched;
+ int cpu;
+ int pc;
+
+ pc = preempt_count();
+ resched = ftrace_preempt_disable();
+ cpu = raw_smp_processor_id();
+ disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+ if (disabled != 1)
+ goto out;
+
+ local_save_flags(flags);
+
+ event = trace_current_buffer_lock_reserve(&buffer,
+ TRACE_FN, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+
+ trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
+
+ out:
+ atomic_dec(&per_cpu(test_event_disable, cpu));
+ ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata =
+{
+ .func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+ register_ftrace_function(&trace_ops);
+ pr_info("Running tests again, along with the function tracer\n");
+ event_trace_self_tests();
+ unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+ if (!tracing_selftest_disabled) {
+ event_trace_self_tests();
+ event_trace_self_test_with_function();
+ }
+
+ return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 00000000000..93660fbbf62
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,1223 @@
+/*
+ * trace_events_filter - generic event filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+enum filter_op_ids
+{
+ OP_OR,
+ OP_AND,
+ OP_NE,
+ OP_EQ,
+ OP_LT,
+ OP_LE,
+ OP_GT,
+ OP_GE,
+ OP_NONE,
+ OP_OPEN_PAREN,
+};
+
+struct filter_op {
+ int id;
+ char *string;
+ int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+ { OP_OR, "||", 1 },
+ { OP_AND, "&&", 2 },
+ { OP_NE, "!=", 4 },
+ { OP_EQ, "==", 4 },
+ { OP_LT, "<", 5 },
+ { OP_LE, "<=", 5 },
+ { OP_GT, ">", 5 },
+ { OP_GE, ">=", 5 },
+ { OP_NONE, "OP_NONE", 0 },
+ { OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+ FILT_ERR_NONE,
+ FILT_ERR_INVALID_OP,
+ FILT_ERR_UNBALANCED_PAREN,
+ FILT_ERR_TOO_MANY_OPERANDS,
+ FILT_ERR_OPERAND_TOO_LONG,
+ FILT_ERR_FIELD_NOT_FOUND,
+ FILT_ERR_ILLEGAL_FIELD_OP,
+ FILT_ERR_ILLEGAL_INTVAL,
+ FILT_ERR_BAD_SUBSYS_FILTER,
+ FILT_ERR_TOO_MANY_PREDS,
+ FILT_ERR_MISSING_FIELD,
+ FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+ "No error",
+ "Invalid operator",
+ "Unbalanced parens",
+ "Too many operands",
+ "Operand too long",
+ "Field not found",
+ "Illegal operation for field type",
+ "Illegal integer value",
+ "Couldn't find or set field in one of a subsystem's events",
+ "Too many terms in predicate expression",
+ "Missing field name and/or value",
+ "Meaningless filter expression",
+};
+
+struct opstack_op {
+ int op;
+ struct list_head list;
+};
+
+struct postfix_elt {
+ int op;
+ char *operand;
+ struct list_head list;
+};
+
+struct filter_parse_state {
+ struct filter_op *ops;
+ struct list_head opstack;
+ struct list_head postfix;
+ int lasterr;
+ int lasterr_pos;
+
+ struct {
+ char *string;
+ unsigned int cnt;
+ unsigned int tail;
+ } infix;
+
+ struct {
+ char string[MAX_FILTER_STR_VAL];
+ int pos;
+ unsigned int tail;
+ } operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+ void *event __attribute((unused)),
+ int val1, int val2)
+{
+ return val1 && val2;
+}
+
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+ void *event __attribute((unused)),
+ int val1, int val2)
+{
+ return val1 || val2;
+}
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ char *addr = (char *)(event + pred->offset);
+ int cmp, match;
+
+ cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+ match = (!cmp) ^ pred->not;
+
+ return match;
+}
+
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ char **addr = (char **)(event + pred->offset);
+ int cmp, match;
+
+ cmp = strncmp(*addr, pred->str_val, pred->str_len);
+
+ match = (!cmp) ^ pred->not;
+
+ return match;
+}
+
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ u32 str_item = *(u32 *)(event + pred->offset);
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
+ char *addr = (char *)(event + str_loc);
+ int cmp, match;
+
+ cmp = strncmp(addr, pred->str_val, str_len);
+
+ match = (!cmp) ^ pred->not;
+
+ return match;
+}
+
+static int filter_pred_none(struct filter_pred *pred, void *event,
+ int val1, int val2)
+{
+ return 0;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
+{
+ struct event_filter *filter = call->filter;
+ int match, top = 0, val1 = 0, val2 = 0;
+ int stack[MAX_FILTER_PRED];
+ struct filter_pred *pred;
+ int i;
+
+ for (i = 0; i < filter->n_preds; i++) {
+ pred = filter->preds[i];
+ if (!pred->pop_n) {
+ match = pred->fn(pred, rec, val1, val2);
+ stack[top++] = match;
+ continue;
+ }
+ if (pred->pop_n > top) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+ val1 = stack[--top];
+ val2 = stack[--top];
+ match = pred->fn(pred, rec, val1, val2);
+ stack[top++] = match;
+ }
+
+ return stack[--top];
+}
+EXPORT_SYMBOL_GPL(filter_match_preds);
+
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
+{
+ ps->lasterr = err;
+ ps->lasterr_pos = pos;
+}
+
+static void remove_filter_string(struct event_filter *filter)
+{
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+}
+
+static int replace_filter_string(struct event_filter *filter,
+ char *filter_string)
+{
+ kfree(filter->filter_string);
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int append_filter_string(struct event_filter *filter,
+ char *string)
+{
+ int newlen;
+ char *new_filter_string;
+
+ BUG_ON(!filter->filter_string);
+ newlen = strlen(filter->filter_string) + strlen(string) + 1;
+ new_filter_string = kmalloc(newlen, GFP_KERNEL);
+ if (!new_filter_string)
+ return -ENOMEM;
+
+ strcpy(new_filter_string, filter->filter_string);
+ strcat(new_filter_string, string);
+ kfree(filter->filter_string);
+ filter->filter_string = new_filter_string;
+
+ return 0;
+}
+
+static void append_filter_err(struct filter_parse_state *ps,
+ struct event_filter *filter)
+{
+ int pos = ps->lasterr_pos;
+ char *buf, *pbuf;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return;
+
+ append_filter_string(filter, "\n");
+ memset(buf, ' ', PAGE_SIZE);
+ if (pos > PAGE_SIZE - 128)
+ pos = 0;
+ buf[pos] = '^';
+ pbuf = &buf[pos] + 1;
+
+ sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+ append_filter_string(filter, buf);
+ free_page((unsigned long) buf);
+}
+
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
+{
+ struct event_filter *filter = call->filter;
+
+ mutex_lock(&event_mutex);
+ if (filter && filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
+ trace_seq_printf(s, "none\n");
+ mutex_unlock(&event_mutex);
+}
+
+void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s)
+{
+ struct event_filter *filter = system->filter;
+
+ mutex_lock(&event_mutex);
+ if (filter && filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
+ trace_seq_printf(s, "none\n");
+ mutex_unlock(&event_mutex);
+}
+
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+
+ list_for_each_entry(field, &call->fields, link) {
+ if (!strcmp(field->name, name))
+ return field;
+ }
+
+ return NULL;
+}
+
+static void filter_free_pred(struct filter_pred *pred)
+{
+ if (!pred)
+ return;
+
+ kfree(pred->field_name);
+ kfree(pred);
+}
+
+static void filter_clear_pred(struct filter_pred *pred)
+{
+ kfree(pred->field_name);
+ pred->field_name = NULL;
+ pred->str_len = 0;
+}
+
+static int filter_set_pred(struct filter_pred *dest,
+ struct filter_pred *src,
+ filter_pred_fn_t fn)
+{
+ *dest = *src;
+ if (src->field_name) {
+ dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+ if (!dest->field_name)
+ return -ENOMEM;
+ }
+ dest->fn = fn;
+
+ return 0;
+}
+
+static void filter_disable_preds(struct ftrace_event_call *call)
+{
+ struct event_filter *filter = call->filter;
+ int i;
+
+ call->filter_active = 0;
+ filter->n_preds = 0;
+
+ for (i = 0; i < MAX_FILTER_PRED; i++)
+ filter->preds[i]->fn = filter_pred_none;
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+ struct event_filter *filter = call->filter;
+ int i;
+
+ if (!filter)
+ return;
+
+ for (i = 0; i < MAX_FILTER_PRED; i++) {
+ if (filter->preds[i])
+ filter_free_pred(filter->preds[i]);
+ }
+ kfree(filter->preds);
+ kfree(filter->filter_string);
+ kfree(filter);
+ call->filter = NULL;
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+ struct event_filter *filter;
+ struct filter_pred *pred;
+ int i;
+
+ if (call->filter)
+ return 0;
+
+ filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (!call->filter)
+ return -ENOMEM;
+
+ filter->n_preds = 0;
+
+ filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+ if (!filter->preds)
+ goto oom;
+
+ for (i = 0; i < MAX_FILTER_PRED; i++) {
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ goto oom;
+ pred->fn = filter_pred_none;
+ filter->preds[i] = pred;
+ }
+
+ return 0;
+
+oom:
+ destroy_preds(call);
+
+ return -ENOMEM;
+}
+
+static int init_subsystem_preds(struct event_subsystem *system)
+{
+ struct ftrace_event_call *call;
+ int err;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name) != 0)
+ continue;
+
+ err = init_preds(call);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+enum {
+ FILTER_DISABLE_ALL,
+ FILTER_INIT_NO_RESET,
+ FILTER_SKIP_NO_RESET,
+};
+
+static void filter_free_subsystem_preds(struct event_subsystem *system,
+ int flag)
+{
+ struct ftrace_event_call *call;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name) != 0)
+ continue;
+
+ if (flag == FILTER_INIT_NO_RESET) {
+ call->filter->no_reset = false;
+ continue;
+ }
+
+ if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
+ continue;
+
+ filter_disable_preds(call);
+ remove_filter_string(call->filter);
+ }
+}
+
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ struct filter_pred *pred,
+ filter_pred_fn_t fn)
+{
+ struct event_filter *filter = call->filter;
+ int idx, err;
+
+ if (filter->n_preds == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
+ idx = filter->n_preds;
+ filter_clear_pred(filter->preds[idx]);
+ err = filter_set_pred(filter->preds[idx], pred, fn);
+ if (err)
+ return err;
+
+ filter->n_preds++;
+ call->filter_active = 1;
+
+ return 0;
+}
+
+int filter_assign_type(const char *type)
+{
+ if (strstr(type, "__data_loc") && strstr(type, "char"))
+ return FILTER_DYN_STRING;
+
+ if (strchr(type, '[') && strstr(type, "char"))
+ return FILTER_STATIC_STRING;
+
+ return FILTER_OTHER;
+}
+
+static bool is_string_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_STATIC_STRING ||
+ field->filter_type == FILTER_PTR_STRING;
+}
+
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+ if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
+ return 0;
+
+ return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+ int field_is_signed)
+{
+ filter_pred_fn_t fn = NULL;
+
+ switch (field_size) {
+ case 8:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_64;
+ else if (field_is_signed)
+ fn = filter_pred_s64;
+ else
+ fn = filter_pred_u64;
+ break;
+ case 4:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_32;
+ else if (field_is_signed)
+ fn = filter_pred_s32;
+ else
+ fn = filter_pred_u32;
+ break;
+ case 2:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_16;
+ else if (field_is_signed)
+ fn = filter_pred_s16;
+ else
+ fn = filter_pred_u16;
+ break;
+ case 1:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_8;
+ else if (field_is_signed)
+ fn = filter_pred_s8;
+ else
+ fn = filter_pred_u8;
+ break;
+ }
+
+ return fn;
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ struct filter_pred *pred,
+ bool dry_run)
+{
+ struct ftrace_event_field *field;
+ filter_pred_fn_t fn;
+ unsigned long long val;
+ int ret;
+
+ pred->fn = filter_pred_none;
+
+ if (pred->op == OP_AND) {
+ pred->pop_n = 2;
+ fn = filter_pred_and;
+ goto add_pred_fn;
+ } else if (pred->op == OP_OR) {
+ pred->pop_n = 2;
+ fn = filter_pred_or;
+ goto add_pred_fn;
+ }
+
+ field = find_event_field(call, pred->field_name);
+ if (!field) {
+ parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
+ return -EINVAL;
+ }
+
+ pred->offset = field->offset;
+
+ if (!is_legal_op(field, pred->op)) {
+ parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+ return -EINVAL;
+ }
+
+ if (is_string_field(field)) {
+ pred->str_len = field->size;
+
+ if (field->filter_type == FILTER_STATIC_STRING)
+ fn = filter_pred_string;
+ else if (field->filter_type == FILTER_DYN_STRING)
+ fn = filter_pred_strloc;
+ else {
+ fn = filter_pred_pchar;
+ pred->str_len = strlen(pred->str_val);
+ }
+ } else {
+ if (field->is_signed)
+ ret = strict_strtoll(pred->str_val, 0, &val);
+ else
+ ret = strict_strtoull(pred->str_val, 0, &val);
+ if (ret) {
+ parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
+ return -EINVAL;
+ }
+ pred->val = val;
+
+ fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (!fn) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
+ }
+
+ if (pred->op == OP_NE)
+ pred->not = 1;
+
+add_pred_fn:
+ if (!dry_run)
+ return filter_add_pred_fn(ps, call, pred, fn);
+ return 0;
+}
+
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+ struct event_subsystem *system,
+ struct filter_pred *pred,
+ char *filter_string,
+ bool dry_run)
+{
+ struct ftrace_event_call *call;
+ int err = 0;
+ bool fail = true;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ if (!call->define_fields)
+ continue;
+
+ if (strcmp(call->system, system->name))
+ continue;
+
+ if (call->filter->no_reset)
+ continue;
+
+ err = filter_add_pred(ps, call, pred, dry_run);
+ if (err)
+ call->filter->no_reset = true;
+ else
+ fail = false;
+
+ if (!dry_run)
+ replace_filter_string(call->filter, filter_string);
+ }
+
+ if (fail) {
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return err;
+ }
+ return 0;
+}
+
+static void parse_init(struct filter_parse_state *ps,
+ struct filter_op *ops,
+ char *infix_string)
+{
+ memset(ps, '\0', sizeof(*ps));
+
+ ps->infix.string = infix_string;
+ ps->infix.cnt = strlen(infix_string);
+ ps->ops = ops;
+
+ INIT_LIST_HEAD(&ps->opstack);
+ INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+ ps->infix.cnt--;
+
+ return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+ if (ps->infix.tail == strlen(ps->infix.string))
+ return 0;
+
+ return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+ ps->infix.cnt--;
+ ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+ int a, int b)
+{
+ return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+ int i;
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (ps->ops[i].string[0] == c)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+ char nextc = infix_peek(ps);
+ char opstr[3];
+ int i;
+
+ opstr[0] = firstc;
+ opstr[1] = nextc;
+ opstr[2] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string)) {
+ infix_advance(ps);
+ return ps->ops[i].id;
+ }
+ }
+
+ opstr[1] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string))
+ return ps->ops[i].id;
+ }
+
+ return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+ memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+ ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+ if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
+ return -EINVAL;
+
+ ps->operand.string[ps->operand.tail++] = c;
+
+ return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+ struct opstack_op *opstack_op;
+
+ opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+ if (!opstack_op)
+ return -ENOMEM;
+
+ opstack_op->op = op;
+ list_add(&opstack_op->list, &ps->opstack);
+
+ return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+ return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+ return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+ int op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+ op = opstack_op->op;
+ list_del(&opstack_op->list);
+
+ kfree(opstack_op);
+
+ return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+ while (!filter_opstack_empty(ps))
+ filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+ return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = OP_NONE;
+ elt->operand = kstrdup(operand, GFP_KERNEL);
+ if (!elt->operand) {
+ kfree(elt);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = op;
+ elt->operand = NULL;
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+
+ while (!list_empty(&ps->postfix)) {
+ elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+ kfree(elt->operand);
+ list_del(&elt->list);
+ }
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+ int in_string = 0;
+ int op, top_op;
+ char ch;
+
+ while ((ch = infix_next(ps))) {
+ if (ch == '"') {
+ in_string ^= 1;
+ continue;
+ }
+
+ if (in_string)
+ goto parse_operand;
+
+ if (isspace(ch))
+ continue;
+
+ if (is_op_char(ps, ch)) {
+ op = infix_get_op(ps, ch);
+ if (op == OP_NONE) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
+
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_top(ps);
+ if (!is_precedence_lower(ps, top_op, op)) {
+ top_op = filter_opstack_pop(ps);
+ postfix_append_op(ps, top_op);
+ continue;
+ }
+ break;
+ }
+
+ filter_opstack_push(ps, op);
+ continue;
+ }
+
+ if (ch == '(') {
+ filter_opstack_push(ps, OP_OPEN_PAREN);
+ continue;
+ }
+
+ if (ch == ')') {
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ top_op = filter_opstack_pop(ps);
+ while (top_op != OP_NONE) {
+ if (top_op == OP_OPEN_PAREN)
+ break;
+ postfix_append_op(ps, top_op);
+ top_op = filter_opstack_pop(ps);
+ }
+ if (top_op == OP_NONE) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ continue;
+ }
+parse_operand:
+ if (append_operand_char(ps, ch)) {
+ parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+ return -EINVAL;
+ }
+ }
+
+ if (strlen(curr_operand(ps)))
+ postfix_append_operand(ps, curr_operand(ps));
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_pop(ps);
+ if (top_op == OP_NONE)
+ break;
+ if (top_op == OP_OPEN_PAREN) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ postfix_append_op(ps, top_op);
+ }
+
+ return 0;
+}
+
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+ struct filter_pred *pred;
+
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ return NULL;
+
+ pred->field_name = kstrdup(operand1, GFP_KERNEL);
+ if (!pred->field_name) {
+ kfree(pred);
+ return NULL;
+ }
+
+ strcpy(pred->str_val, operand2);
+ pred->str_len = strlen(operand2);
+
+ pred->op = op;
+
+ return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+ struct filter_pred *pred;
+
+ pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+ if (!pred)
+ return NULL;
+
+ pred->op = op;
+
+ return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+ int n_normal_preds = 0, n_logical_preds = 0;
+ struct postfix_elt *elt;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE)
+ continue;
+
+ if (elt->op == OP_AND || elt->op == OP_OR) {
+ n_logical_preds++;
+ continue;
+ }
+ n_normal_preds++;
+ }
+
+ if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+ parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int replace_preds(struct event_subsystem *system,
+ struct ftrace_event_call *call,
+ struct filter_parse_state *ps,
+ char *filter_string,
+ bool dry_run)
+{
+ char *operand1 = NULL, *operand2 = NULL;
+ struct filter_pred *pred;
+ struct postfix_elt *elt;
+ int err;
+ int n_preds = 0;
+
+ err = check_preds(ps);
+ if (err)
+ return err;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE) {
+ if (!operand1)
+ operand1 = elt->operand;
+ else if (!operand2)
+ operand2 = elt->operand;
+ else {
+ parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+ return -EINVAL;
+ }
+ continue;
+ }
+
+ if (n_preds++ == MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
+ if (elt->op == OP_AND || elt->op == OP_OR) {
+ pred = create_logical_pred(elt->op);
+ goto add_pred;
+ }
+
+ if (!operand1 || !operand2) {
+ parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+ return -EINVAL;
+ }
+
+ pred = create_pred(elt->op, operand1, operand2);
+add_pred:
+ if (!pred)
+ return -ENOMEM;
+ if (call)
+ err = filter_add_pred(ps, call, pred, false);
+ else
+ err = filter_add_subsystem_pred(ps, system, pred,
+ filter_string, dry_run);
+ filter_free_pred(pred);
+ if (err)
+ return err;
+
+ operand1 = operand2 = NULL;
+ }
+
+ return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+ int err;
+
+ struct filter_parse_state *ps;
+
+ mutex_lock(&event_mutex);
+
+ err = init_preds(call);
+ if (err)
+ goto out_unlock;
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_disable_preds(call);
+ remove_filter_string(call->filter);
+ mutex_unlock(&event_mutex);
+ return 0;
+ }
+
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out_unlock;
+
+ filter_disable_preds(call);
+ replace_filter_string(call->filter, filter_string);
+
+ parse_init(ps, filter_ops, filter_string);
+ err = filter_parse(ps);
+ if (err) {
+ append_filter_err(ps, call->filter);
+ goto out;
+ }
+
+ err = replace_preds(NULL, call, ps, filter_string, false);
+ if (err)
+ append_filter_err(ps, call->filter);
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+ char *filter_string)
+{
+ int err;
+
+ struct filter_parse_state *ps;
+
+ mutex_lock(&event_mutex);
+
+ err = init_subsystem_preds(system);
+ if (err)
+ goto out_unlock;
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+ remove_filter_string(system->filter);
+ mutex_unlock(&event_mutex);
+ return 0;
+ }
+
+ err = -ENOMEM;
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out_unlock;
+
+ replace_filter_string(system->filter, filter_string);
+
+ parse_init(ps, filter_ops, filter_string);
+ err = filter_parse(ps);
+ if (err) {
+ append_filter_err(ps, system->filter);
+ goto out;
+ }
+
+ filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
+
+ /* try to see the filter can be applied to which events */
+ err = replace_preds(system, NULL, ps, filter_string, true);
+ if (err) {
+ append_filter_err(ps, system->filter);
+ goto out;
+ }
+
+ filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+
+ /* really apply the filter to the events */
+ err = replace_preds(system, NULL, ps, filter_string, false);
+ if (err) {
+ append_filter_err(ps, system->filter);
+ filter_free_subsystem_preds(system, 2);
+ }
+
+out:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 00000000000..df1bf6e48bb
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,206 @@
+/*
+ * trace_export.c - export basic ftrace utilities to user space
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/stringify.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+extern void __bad_type_size(void);
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign) \
+ if (sizeof(type) != sizeof(field.item)) \
+ __bad_type_size(); \
+ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
+ "offset:%u;\tsize:%u;\n", \
+ (unsigned int)offsetof(typeof(field), item), \
+ (unsigned int)sizeof(field.item)); \
+ if (!ret) \
+ return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
+ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
+ "offset:%u;\tsize:%u;\n", \
+ (unsigned int)offsetof(typeof(field), item), \
+ (unsigned int)sizeof(field.item)); \
+ if (!ret) \
+ return 0;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item) \
+ ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
+ "offset:%u;\tsize:0;\n", \
+ (unsigned int)offsetof(typeof(field), item)); \
+ if (!ret) \
+ return 0;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ TRACE_FIELD(type, item, assign)
+
+#undef TP_RAW_FMT
+#define TP_RAW_FMT(args...) args
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+static int \
+ftrace_format_##call(struct ftrace_event_call *unused, \
+ struct trace_seq *s) \
+{ \
+ struct args field; \
+ int ret; \
+ \
+ tstruct; \
+ \
+ trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
+ \
+ return ret; \
+}
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt) \
+static int \
+ftrace_format_##call(struct ftrace_event_call *unused, \
+ struct trace_seq *s) \
+{ \
+ struct args field; \
+ int ret; \
+ \
+ tstruct; \
+ \
+ trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
+ \
+ return ret; \
+}
+
+#include "trace_event_types.h"
+
+#undef TRACE_ZERO_CHAR
+#define TRACE_ZERO_CHAR(arg)
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+ entry->item = assign;
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+ entry->item = assign;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ TRACE_FIELD(type, item, assign)
+
+#undef TP_CMD
+#define TP_CMD(cmd...) cmd
+
+#undef TRACE_ENTRY
+#define TRACE_ENTRY entry
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
+ cmd;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
+static int ftrace_raw_init_event_##call(void); \
+ \
+struct ftrace_event_call __used \
+__attribute__((__aligned__(4))) \
+__attribute__((section("_ftrace_events"))) event_##call = { \
+ .name = #call, \
+ .id = proto, \
+ .system = __stringify(TRACE_SYSTEM), \
+ .raw_init = ftrace_raw_init_event_##call, \
+ .show_format = ftrace_format_##call, \
+ .define_fields = ftrace_define_fields_##call, \
+}; \
+static int ftrace_raw_init_event_##call(void) \
+{ \
+ INIT_LIST_HEAD(&event_##call.fields); \
+ return 0; \
+} \
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt) \
+ \
+struct ftrace_event_call __used \
+__attribute__((__aligned__(4))) \
+__attribute__((section("_ftrace_events"))) event_##call = { \
+ .name = #call, \
+ .id = proto, \
+ .system = __stringify(TRACE_SYSTEM), \
+ .show_format = ftrace_format_##call, \
+};
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
+ ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), 0, FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
+int \
+ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
+{ \
+ struct args field; \
+ int ret; \
+ \
+ ret = trace_define_common_fields(event_call); \
+ if (ret) \
+ return ret; \
+ \
+ tstruct; \
+ \
+ return ret; \
+}
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
+ tpfmt)
+
+#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a1..5b01b94518f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
* Copyright (C) 2004-2006 Ingo Molnar
* Copyright (C) 2004 William Lee Irwin III
*/
+#include <linux/ring_buffer.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
@@ -16,52 +17,387 @@
#include "trace.h"
-static void start_function_trace(struct trace_array *tr)
+/* function tracing enabled */
+static int ftrace_function_enabled;
+
+static struct trace_array *func_trace;
+
+static void tracing_start_function_trace(void);
+static void tracing_stop_function_trace(void);
+
+static int function_trace_init(struct trace_array *tr)
{
+ func_trace = tr;
tr->cpu = get_cpu();
- tracing_reset_online_cpus(tr);
put_cpu();
tracing_start_cmdline_record();
tracing_start_function_trace();
+ return 0;
}
-static void stop_function_trace(struct trace_array *tr)
+static void function_trace_reset(struct trace_array *tr)
{
tracing_stop_function_trace();
tracing_stop_cmdline_record();
}
-static int function_trace_init(struct trace_array *tr)
+static void function_trace_start(struct trace_array *tr)
{
- start_function_trace(tr);
- return 0;
+ tracing_reset_online_cpus(tr);
}
-static void function_trace_reset(struct trace_array *tr)
+static void
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu, resched;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ pc = preempt_count();
+ resched = ftrace_preempt_disable();
+ local_save_flags(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1))
+ trace_function(tr, ip, parent_ip, flags, pc);
+
+ atomic_dec(&data->disabled);
+ ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
{
- stop_function_trace(tr);
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, ip, parent_ip, flags, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
}
-static void function_trace_start(struct trace_array *tr)
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
- tracing_reset_online_cpus(tr);
+ struct trace_array *tr = func_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!ftrace_function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, ip, parent_ip, flags, pc);
+ /*
+ * skip over 5 funcs:
+ * __ftrace_trace_stack,
+ * __trace_stack,
+ * function_stack_trace_call
+ * ftrace_list_func
+ * ftrace_call
+ */
+ __trace_stack(tr, flags, 5, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = function_trace_call,
+};
+
+static struct ftrace_ops trace_stack_ops __read_mostly =
+{
+ .func = function_stack_trace_call,
+};
+
+/* Our two options */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static struct tracer_opt func_opts[] = {
+#ifdef CONFIG_STACKTRACE
+ { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
+#endif
+ { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags func_flags = {
+ .val = 0, /* By default: all flags disabled */
+ .opts = func_opts
+};
+
+static void tracing_start_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+
+ if (trace_flags & TRACE_ITER_PREEMPTONLY)
+ trace_ops.func = function_trace_call_preempt_only;
+ else
+ trace_ops.func = function_trace_call;
+
+ if (func_flags.val & TRACE_FUNC_OPT_STACK)
+ register_ftrace_function(&trace_stack_ops);
+ else
+ register_ftrace_function(&trace_ops);
+
+ ftrace_function_enabled = 1;
+}
+
+static void tracing_stop_function_trace(void)
+{
+ ftrace_function_enabled = 0;
+
+ if (func_flags.val & TRACE_FUNC_OPT_STACK)
+ unregister_ftrace_function(&trace_stack_ops);
+ else
+ unregister_ftrace_function(&trace_ops);
+}
+
+static int func_set_flag(u32 old_flags, u32 bit, int set)
+{
+ if (bit == TRACE_FUNC_OPT_STACK) {
+ /* do nothing if already set */
+ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
+ return 0;
+
+ if (set) {
+ unregister_ftrace_function(&trace_ops);
+ register_ftrace_function(&trace_stack_ops);
+ } else {
+ unregister_ftrace_function(&trace_stack_ops);
+ register_ftrace_function(&trace_ops);
+ }
+
+ return 0;
+ }
+
+ return -EINVAL;
}
static struct tracer function_trace __read_mostly =
{
- .name = "function",
- .init = function_trace_init,
- .reset = function_trace_reset,
- .start = function_trace_start,
+ .name = "function",
+ .init = function_trace_init,
+ .reset = function_trace_reset,
+ .start = function_trace_start,
+ .wait_pipe = poll_wait_pipe,
+ .flags = &func_flags,
+ .set_flag = func_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_function,
+ .selftest = trace_selftest_startup_function,
#endif
};
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ long *count = (long *)data;
+
+ if (tracing_is_on())
+ return;
+
+ if (!*count)
+ return;
+
+ if (*count != -1)
+ (*count)--;
+
+ tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ long *count = (long *)data;
+
+ if (!tracing_is_on())
+ return;
+
+ if (!*count)
+ return;
+
+ if (*count != -1)
+ (*count)--;
+
+ tracing_off();
+}
+
+static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data);
+
+static struct ftrace_probe_ops traceon_probe_ops = {
+ .func = ftrace_traceon,
+ .print = ftrace_trace_onoff_print,
+};
+
+static struct ftrace_probe_ops traceoff_probe_ops = {
+ .func = ftrace_traceoff,
+ .print = ftrace_trace_onoff_print,
+};
+
+static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%pf:", (void *)ip);
+
+ if (ops == &traceon_probe_ops)
+ seq_printf(m, "traceon");
+ else
+ seq_printf(m, "traceoff");
+
+ if (count == -1)
+ seq_printf(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", count);
+
+ return 0;
+}
+
+static int
+ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
+{
+ struct ftrace_probe_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = &traceon_probe_ops;
+ else
+ ops = &traceoff_probe_ops;
+
+ unregister_ftrace_function_probe_func(glob, ops);
+
+ return 0;
+}
+
+static int
+ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ if (glob[0] == '!')
+ return ftrace_trace_onoff_unreg(glob+1, cmd, param);
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = &traceon_probe_ops;
+ else
+ ops = &traceoff_probe_ops;
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = strict_strtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = register_ftrace_function_probe(glob, ops, count);
+
+ return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_traceon_cmd = {
+ .name = "traceon",
+ .func = ftrace_trace_onoff_callback,
+};
+
+static struct ftrace_func_command ftrace_traceoff_cmd = {
+ .name = "traceoff",
+ .func = ftrace_trace_onoff_callback,
+};
+
+static int __init init_func_cmd_traceon(void)
+{
+ int ret;
+
+ ret = register_ftrace_command(&ftrace_traceoff_cmd);
+ if (ret)
+ return ret;
+
+ ret = register_ftrace_command(&ftrace_traceon_cmd);
+ if (ret)
+ unregister_ftrace_command(&ftrace_traceoff_cmd);
+ return ret;
+}
+#else
+static inline int init_func_cmd_traceon(void)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
static __init int init_function_trace(void)
{
+ init_func_cmd_traceon();
return register_tracer(&function_trace);
}
-
device_initcall(init_function_trace);
+
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38..b3749a2c313 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
/*
*
* Function graph tracer.
- * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
* Mostly borrowed from function tracer which
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
@@ -12,6 +12,12 @@
#include <linux/fs.h>
#include "trace.h"
+#include "trace_output.h"
+
+struct fgraph_data {
+ pid_t last_pid;
+ int depth;
+};
#define TRACE_GRAPH_INDENT 2
@@ -20,9 +26,11 @@
#define TRACE_GRAPH_PRINT_CPU 0x2
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
+#define TRACE_GRAPH_PRINT_DURATION 0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
static struct tracer_opt trace_opts[] = {
- /* Display overruns ? */
+ /* Display overruns? (for self-debug purpose) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
/* Display CPU ? */
{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -30,27 +38,251 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
/* Display proc name/pid */
{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
+ /* Display duration of execution */
+ { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
+ /* Display absolute time of an entry */
+ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns and proc by default */
- .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
+ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
+ TRACE_GRAPH_PRINT_DURATION,
.opts = trace_opts
};
-/* pid on the last trace processed */
-static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+static struct trace_array *graph_array;
-static int graph_trace_init(struct trace_array *tr)
+
+/* Add a function return address to the trace stack on thread info.*/
+int
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
+ unsigned long frame_pointer)
+{
+ unsigned long long calltime;
+ int index;
+
+ if (!current->ret_stack)
+ return -EBUSY;
+
+ /*
+ * We must make sure the ret_stack is tested before we read
+ * anything else.
+ */
+ smp_rmb();
+
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
+
+ calltime = trace_clock_local();
+
+ index = ++current->curr_ret_stack;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = calltime;
+ current->ret_stack[index].subtime = 0;
+ current->ret_stack[index].fp = frame_pointer;
+ *depth = index;
+
+ return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+ unsigned long frame_pointer)
{
- int cpu, ret;
+ int index;
- for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
+ index = current->curr_ret_stack;
+ if (unlikely(index < 0)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+ /*
+ * The arch may choose to record the frame pointer used
+ * and check it here to make sure that it is what we expect it
+ * to be. If gcc does not set the place holder of the return
+ * address in the frame pointer, and does a copy instead, then
+ * the function graph trace will fail. This test detects this
+ * case.
+ *
+ * Currently, x86_32 with optimize for size (-Os) makes the latest
+ * gcc do the above.
+ */
+ if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ ftrace_graph_stop();
+ WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+ " from func %pF return to %lx\n",
+ current->ret_stack[index].fp,
+ frame_pointer,
+ (void *)current->ret_stack[index].func,
+ current->ret_stack[index].ret);
+ *ret = (unsigned long)panic;
+ return;
+ }
+#endif
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = index;
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+ trace.rettime = trace_clock_local();
+ ftrace_graph_return(&trace);
+ barrier();
+ current->curr_ret_stack--;
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+static int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ftrace_graph_ent_entry *entry;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return 0;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ ring_buffer_unlock_commit(buffer, event);
+
+ return 1;
+}
+
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int ret;
+ int cpu;
+ int pc;
+
+ if (unlikely(!tr))
+ return 0;
+
+ if (!ftrace_trace_task(current))
+ return 0;
+
+ if (!ftrace_graph_addr(trace->func))
+ return 0;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ } else {
+ ret = 0;
+ }
+ /* Only do the atomic if it is not already set */
+ if (!test_tsk_trace_graph(current))
+ set_tsk_trace_graph(current);
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ftrace_graph_ret_entry *entry;
+
+ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+ return;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->ret = *trace;
+ if (!filter_current_check_discard(buffer, call, entry, event))
+ ring_buffer_unlock_commit(buffer, event);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ }
+ if (!trace->depth)
+ clear_tsk_trace_graph(current);
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+static int graph_trace_init(struct trace_array *tr)
+{
+ int ret;
+
+ graph_array = tr;
ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry);
+ &trace_graph_entry);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -58,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+void set_graph_array(struct trace_array *tr)
+{
+ graph_array = tr;
+}
+
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
unregister_ftrace_graph();
}
-static inline int log10_cpu(int nb)
-{
- if (nb / 100)
- return 3;
- if (nb / 10)
- return 2;
- return 1;
-}
+static int max_bytes_for_cpu;
static enum print_line_t
print_graph_cpu(struct trace_seq *s, int cpu)
{
- int i;
int ret;
- int log10_this = log10_cpu(cpu);
- int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " ");
-
- /*
- * Tricky - we space the CPU field according to the max
- * number of online CPUs. On a 2-cpu system it would take
- * a maximum of 1 digit - on a 128 cpu system it would
- * take up to 3 digits:
- */
- for (i = 0; i < log10_all - log10_this; i++) {
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- ret = trace_seq_printf(s, "%d) ", cpu);
+ ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -112,15 +325,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
static enum print_line_t
print_graph_proc(struct trace_seq *s, pid_t pid)
{
- int i;
- int ret;
- int len;
- char comm[8];
- int spaces = 0;
+ char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
+ int spaces = 0;
+ int ret;
+ int len;
+ int i;
- strncpy(comm, trace_find_cmdline(pid), 7);
+ trace_find_cmdline(pid, comm);
comm[7] = '\0';
sprintf(pid_str, "%d", pid);
@@ -153,17 +366,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
/* If the pid changed since the last trace, output this event */
static enum print_line_t
-verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
+ pid_t *last_pid;
int ret;
- if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
+ if (!data)
+ return TRACE_TYPE_HANDLED;
+
+ last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+
+ if (*last_pid == pid)
return TRACE_TYPE_HANDLED;
- prev_pid = last_pid[cpu];
- last_pid[cpu] = pid;
+ prev_pid = *last_pid;
+ *last_pid = pid;
+ if (prev_pid == -1)
+ return TRACE_TYPE_HANDLED;
/*
* Context-switch trace line:
@@ -175,34 +396,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
ret = trace_seq_printf(s,
" ------------------------------------------\n");
if (!ret)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, prev_pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " => ");
if (!ret)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s,
"\n ------------------------------------------\n\n");
if (!ret)
- TRACE_TYPE_PARTIAL_LINE;
+ return TRACE_TYPE_PARTIAL_LINE;
- return ret;
+ return TRACE_TYPE_HANDLED;
}
-static bool
-trace_branch_is_leaf(struct trace_iterator *iter,
+static struct ftrace_graph_ret_entry *
+get_return_for_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *curr)
{
struct ring_buffer_iter *ring_iter;
@@ -211,72 +432,130 @@ trace_branch_is_leaf(struct trace_iterator *iter,
ring_iter = iter->buffer_iter[iter->cpu];
- if (!ring_iter)
- return false;
-
- event = ring_buffer_iter_peek(ring_iter, NULL);
+ /* First peek to compare current entry and the next one */
+ if (ring_iter)
+ event = ring_buffer_iter_peek(ring_iter, NULL);
+ else {
+ /* We need to consume the current entry to see the next one */
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+ event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+ NULL);
+ }
if (!event)
- return false;
+ return NULL;
next = ring_buffer_event_data(event);
if (next->ent.type != TRACE_GRAPH_RET)
- return false;
+ return NULL;
if (curr->ent.pid != next->ent.pid ||
curr->graph_ent.func != next->ret.func)
- return false;
+ return NULL;
+
+ /* this is a leaf, now advance the iterator */
+ if (ring_iter)
+ ring_buffer_read(ring_iter, NULL);
- return true;
+ return next;
+}
+
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+ /* If duration disappear, we don't need anything */
+ if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+ return 1;
+
+ /* Non nested entry or return */
+ if (duration == -1)
+ return trace_seq_printf(s, " ");
+
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ /* Duration exceeded 100 msecs */
+ if (duration > 100000ULL)
+ return trace_seq_printf(s, "! ");
+
+ /* Duration exceeded 10 msecs */
+ if (duration > 10000ULL)
+ return trace_seq_printf(s, "+ ");
+ }
+
+ return trace_seq_printf(s, " ");
+}
+
+static int print_graph_abs_time(u64 t, struct trace_seq *s)
+{
+ unsigned long usecs_rem;
+
+ usecs_rem = do_div(t, NSEC_PER_SEC);
+ usecs_rem /= 1000;
+
+ return trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
}
static enum print_line_t
-print_graph_irq(struct trace_seq *s, unsigned long addr,
- enum trace_type type, int cpu, pid_t pid)
+print_graph_irq(struct trace_iterator *iter, unsigned long addr,
+ enum trace_type type, int cpu, pid_t pid)
{
int ret;
+ struct trace_seq *s = &iter->seq;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return TRACE_TYPE_UNHANDLED;
- if (type == TRACE_GRAPH_ENT) {
- ret = trace_seq_printf(s, "==========> | ");
- } else {
- /* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- /* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ /* Absolute time */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ ret = print_graph_abs_time(iter->ts, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ /* Cpu */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ ret = print_graph_cpu(s, cpu);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ /* Proc */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+ ret = print_graph_proc(s, pid);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_printf(s, " | ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
- /* No overhead */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ /* No overhead */
+ ret = print_graph_overhead(-1, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (type == TRACE_GRAPH_ENT)
+ ret = trace_seq_printf(s, "==========>");
+ else
+ ret = trace_seq_printf(s, "<==========");
+
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* Don't close the duration column if haven't one */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ trace_seq_printf(s, " |");
+ ret = trace_seq_printf(s, "\n");
- ret = trace_seq_printf(s, "<========== |\n");
- }
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
@@ -288,7 +567,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
sprintf(msecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
- ret = trace_seq_printf(s, msecs_str);
+ ret = trace_seq_printf(s, "%s", msecs_str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -313,60 +592,66 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
-
- ret = trace_seq_printf(s, "| ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
-
}
-/* Signal a overhead of time execution to the output */
-static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- return trace_seq_printf(s, "! ");
+ int ret;
- /* Duration exceeded 10 msecs */
- if (duration > 10000ULL)
- return trace_seq_printf(s, "+ ");
+ ret = trace_print_graph_duration(duration, s);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
- return trace_seq_printf(s, " ");
+ ret = trace_seq_printf(s, "| ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
}
/* Case of a leaf function on its call entry */
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
- struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
+ struct ftrace_graph_ent_entry *entry,
+ struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
{
- struct ftrace_graph_ret_entry *ret_entry;
+ struct fgraph_data *data = iter->private;
struct ftrace_graph_ret *graph_ret;
- struct ring_buffer_event *event;
struct ftrace_graph_ent *call;
unsigned long long duration;
int ret;
int i;
- event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
- ret_entry = ring_buffer_event_data(event);
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
- /* Overhead */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- ret = print_graph_overhead(duration, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (data) {
+ int cpu = iter->cpu;
+ int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+ /*
+ * Comments display at + 1 to depth. Since
+ * this is a leaf function, keep the comments
+ * equal to this depth.
+ */
+ *depth = call->depth - 1;
}
- /* Duration */
- ret = print_graph_duration(duration, s);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
+ /* Overhead */
+ ret = print_graph_overhead(duration, s);
+ if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
+ /* Duration */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ ret = print_graph_duration(duration, s);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -374,11 +659,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, "();\n");
+ ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -386,33 +667,34 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
static enum print_line_t
-print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
- struct trace_seq *s, pid_t pid, int cpu)
+print_graph_entry_nested(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *entry,
+ struct trace_seq *s, int cpu)
{
- int i;
- int ret;
struct ftrace_graph_ent *call = &entry->graph_ent;
+ struct fgraph_data *data = iter->private;
+ int ret;
+ int i;
- /* No overhead */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- ret = trace_seq_printf(s, " ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (data) {
+ int cpu = iter->cpu;
+ int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+ *depth = call->depth;
}
- /* Interrupt */
- ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
- if (ret == TRACE_TYPE_UNHANDLED) {
- /* No time */
+ /* No overhead */
+ ret = print_graph_overhead(-1, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* No time */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- } else {
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
}
-
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -420,28 +702,44 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = seq_print_ip_sym(s, call->func, 0);
+ ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, "() {\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ /*
+ * we already consumed the current entry to check the next one
+ * and see if this is a leaf.
+ */
+ return TRACE_TYPE_NO_CONSUME;
}
static enum print_line_t
-print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
- struct trace_iterator *iter, int cpu)
+print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
+ int type, unsigned long addr)
{
- int ret;
+ struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
+ int cpu = iter->cpu;
+ int ret;
/* Pid */
- if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+ if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
+ if (type) {
+ /* Interrupt */
+ ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* Absolute time */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ ret = print_graph_abs_time(iter->ts, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
@@ -460,54 +758,65 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
- if (trace_branch_is_leaf(iter, field))
- return print_graph_entry_leaf(iter, field, s);
+ return 0;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+ struct trace_iterator *iter)
+{
+ int cpu = iter->cpu;
+ struct ftrace_graph_ent *call = &field->graph_ent;
+ struct ftrace_graph_ret_entry *leaf_ret;
+
+ if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ leaf_ret = get_return_for_leaf(iter, field);
+ if (leaf_ret)
+ return print_graph_entry_leaf(iter, field, leaf_ret, s);
else
- return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
+ return print_graph_entry_nested(iter, field, s, cpu);
}
static enum print_line_t
print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
- struct trace_entry *ent, int cpu)
+ struct trace_entry *ent, struct trace_iterator *iter)
{
- int i;
- int ret;
unsigned long long duration = trace->rettime - trace->calltime;
+ struct fgraph_data *data = iter->private;
+ pid_t pid = ent->pid;
+ int cpu = iter->cpu;
+ int ret;
+ int i;
- /* Pid */
- if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (data) {
+ int cpu = iter->cpu;
+ int *depth = &(per_cpu_ptr(data, cpu)->depth);
- /* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ /*
+ * Comments display at + 1 to depth. This is the
+ * return from a function, we now want the comments
+ * to display at the same level of the bracket.
+ */
+ *depth = trace->depth - 1;
}
- /* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (print_graph_prologue(iter, s, 0, 0))
+ return TRACE_TYPE_PARTIAL_LINE;
/* Overhead */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- ret = print_graph_overhead(duration, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ ret = print_graph_overhead(duration, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
- ret = print_graph_duration(duration, s);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ ret = print_graph_duration(duration, s);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -528,7 +837,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
+ ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -536,61 +845,73 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
static enum print_line_t
-print_graph_comment(struct print_entry *trace, struct trace_seq *s,
- struct trace_entry *ent, struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+ struct trace_iterator *iter)
{
- int i;
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct fgraph_data *data = iter->private;
+ struct trace_event *event;
+ int depth = 0;
int ret;
+ int i;
- /* Pid */
- if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- /* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, iter->cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
-
- /* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (data)
+ depth = per_cpu_ptr(data, iter->cpu)->depth;
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (print_graph_prologue(iter, s, 0, 0))
+ return TRACE_TYPE_PARTIAL_LINE;
/* No overhead */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- ret = trace_seq_printf(s, " ");
+ ret = print_graph_overhead(-1, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /* No time */
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
- /* No time */
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
/* Indentation */
- if (trace->depth > 0)
- for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
+ if (depth > 0)
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* The comment */
- ret = trace_seq_printf(s, "/* %s", trace->buf);
+ ret = trace_seq_printf(s, "/* ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (ent->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
+ switch (iter->ent->type) {
+ case TRACE_BPRINT:
+ ret = trace_print_bprintk_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
+ case TRACE_PRINT:
+ ret = trace_print_printk_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
+ default:
+ event = ftrace_find_event(ent->type);
+ if (!event)
+ return TRACE_TYPE_UNHANDLED;
+
+ ret = event->trace(iter, sym_flags);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ }
+
+ /* Strip ending newline */
+ if (s->buffer[s->len - 1] == '\n') {
+ s->buffer[s->len - 1] = '\0';
+ s->len--;
+ }
ret = trace_seq_printf(s, " */\n");
if (!ret)
@@ -603,66 +924,104 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
- struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- struct ftrace_graph_ent_entry *field;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry *field, saved;
trace_assign_type(field, entry);
- return print_graph_entry(field, s, iter,
- iter->cpu);
+ saved = *field;
+ return print_graph_entry(&saved, s, iter);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter->cpu);
- }
- case TRACE_PRINT: {
- struct print_entry *field;
- trace_assign_type(field, entry);
- return print_graph_comment(field, s, entry, iter);
+ return print_graph_return(&field->ret, s, entry, iter);
}
default:
- return TRACE_TYPE_UNHANDLED;
+ return print_graph_comment(s, entry, iter);
}
+
+ return TRACE_TYPE_HANDLED;
}
static void print_graph_headers(struct seq_file *s)
{
/* 1st line */
seq_printf(s, "# ");
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_printf(s, " TIME ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, "CPU ");
+ seq_printf(s, "CPU");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, "TASK/PID ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
- seq_printf(s, "OVERHEAD/");
- seq_printf(s, "DURATION FUNCTION CALLS\n");
+ seq_printf(s, " TASK/PID ");
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ seq_printf(s, " DURATION ");
+ seq_printf(s, " FUNCTION CALLS\n");
/* 2nd line */
seq_printf(s, "# ");
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_printf(s, " | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, "| ");
+ seq_printf(s, "| ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, "| | ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
- seq_printf(s, "| ");
- seq_printf(s, "| | | | |\n");
- } else
- seq_printf(s, " | | | | |\n");
+ seq_printf(s, " | | ");
+ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ seq_printf(s, " | | ");
+ seq_printf(s, " | | | |\n");
}
+
+static void graph_trace_open(struct trace_iterator *iter)
+{
+ /* pid and depth on the last trace processed */
+ struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+ int cpu;
+
+ if (!data)
+ pr_warning("function graph tracer: not enough memory\n");
+ else
+ for_each_possible_cpu(cpu) {
+ pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+ int *depth = &(per_cpu_ptr(data, cpu)->depth);
+ *pid = -1;
+ *depth = 0;
+ }
+
+ iter->private = data;
+}
+
+static void graph_trace_close(struct trace_iterator *iter)
+{
+ free_percpu(iter->private);
+}
+
static struct tracer graph_trace __read_mostly = {
- .name = "function_graph",
- .init = graph_trace_init,
- .reset = graph_trace_reset,
+ .name = "function_graph",
+ .open = graph_trace_open,
+ .close = graph_trace_close,
+ .wait_pipe = poll_wait_pipe,
+ .init = graph_trace_init,
+ .reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_function_graph,
+#endif
};
static __init int init_graph_trace(void)
{
+ max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 649df22d435..ca7d7c4d0c2 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,89 +1,147 @@
/*
- * h/w branch tracer for x86 based on bts
- *
- * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ * h/w branch tracer for x86 based on BTS
*
+ * Copyright (C) 2008-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
*/
-
-#include <linux/module.h>
-#include <linux/fs.h>
+#include <linux/kallsyms.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
-#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
#include <asm/ds.h>
+#include "trace_output.h"
#include "trace.h"
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
#define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
+static struct trace_array *hw_branch_trace __read_mostly;
-static void bts_trace_start_cpu(void *arg)
+
+static void bts_trace_init_cpu(int cpu)
{
- if (this_tracer)
- ds_release_bts(this_tracer);
+ per_cpu(tracer, cpu) =
+ ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
- this_tracer =
- ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
- /* ovfl = */ NULL, /* th = */ (size_t)-1,
- BTS_KERNEL);
- if (IS_ERR(this_tracer)) {
- this_tracer = NULL;
- return;
- }
+ if (IS_ERR(per_cpu(tracer, cpu)))
+ per_cpu(tracer, cpu) = NULL;
}
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
{
int cpu;
- tracing_reset_online_cpus(tr);
+ hw_branch_trace = tr;
+ trace_hw_branches_enabled = 0;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ bts_trace_init_cpu(cpu);
+
+ if (likely(per_cpu(tracer, cpu)))
+ trace_hw_branches_enabled = 1;
+ }
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
- for_each_cpu(cpu, cpu_possible_mask)
- smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+ /* If we could not enable tracing on a single cpu, we fail. */
+ return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
}
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_reset(struct trace_array *tr)
{
- if (this_tracer) {
- ds_release_bts(this_tracer);
- this_tracer = NULL;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ if (likely(per_cpu(tracer, cpu))) {
+ ds_release_bts(per_cpu(tracer, cpu));
+ per_cpu(tracer, cpu) = NULL;
+ }
}
+ trace_hw_branches_enabled = 0;
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_resume_bts(per_cpu(tracer, cpu));
+ trace_hw_branches_suspended = 0;
+ put_online_cpus();
}
static void bts_trace_stop(struct trace_array *tr)
{
int cpu;
- for_each_cpu(cpu, cpu_possible_mask)
- smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ trace_hw_branches_suspended = 1;
+ put_online_cpus();
}
-static int bts_trace_init(struct trace_array *tr)
+static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
- tracing_reset_online_cpus(tr);
- bts_trace_start(tr);
+ int cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ /* The notification is sent with interrupts enabled. */
+ if (trace_hw_branches_enabled) {
+ bts_trace_init_cpu(cpu);
+
+ if (trace_hw_branches_suspended &&
+ likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ }
+ break;
- return 0;
+ case CPU_DOWN_PREPARE:
+ /* The notification is sent with interrupts enabled. */
+ if (likely(per_cpu(tracer, cpu))) {
+ ds_release_bts(per_cpu(tracer, cpu));
+ per_cpu(tracer, cpu) = NULL;
+ }
+ }
+
+ return NOTIFY_DONE;
}
+static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
+ .notifier_call = bts_hotcpu_handler
+};
+
static void bts_trace_print_header(struct seq_file *m)
{
- seq_puts(m,
- "# CPU# FROM TO FUNCTION\n");
- seq_puts(m,
- "# | | | |\n");
+ seq_puts(m, "# CPU# TO <- FROM\n");
}
static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
{
+ unsigned long symflags = TRACE_ITER_SYM_OFFSET;
struct trace_entry *entry = iter->ent;
struct trace_seq *seq = &iter->seq;
struct hw_branch_entry *it;
@@ -91,11 +149,10 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
trace_assign_type(it, entry);
if (entry->type == TRACE_HW_BRANCHES) {
- if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
- trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
- it->from, it->to) &&
- (!it->from ||
- seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
+ if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
+ seq_print_ip_sym(seq, it->to, symflags) &&
+ trace_seq_printf(seq, "\t <- ") &&
+ seq_print_ip_sym(seq, it->from, symflags) &&
trace_seq_printf(seq, "\n"))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +160,44 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
return TRACE_TYPE_UNHANDLED;
}
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
+void trace_hw_branch(u64 from, u64 to)
{
+ struct ftrace_event_call *call = &event_hw_branch;
+ struct trace_array *tr = hw_branch_trace;
struct ring_buffer_event *event;
struct hw_branch_entry *entry;
- unsigned long irq;
+ unsigned long irq1;
+ int cpu;
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
- if (!event)
+ if (unlikely(!tr))
+ return;
+
+ if (unlikely(!trace_hw_branches_enabled))
return;
+
+ local_irq_save(irq1);
+ cpu = raw_smp_processor_id();
+ if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+ goto out;
+
+ event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
+ sizeof(*entry), 0, 0);
+ if (!event)
+ goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, from);
entry->ent.type = TRACE_HW_BRANCHES;
- entry->ent.cpu = smp_processor_id();
entry->from = from;
entry->to = to;
- ring_buffer_unlock_commit(tr->buffer, event, irq);
+ if (!filter_check_discard(call, entry, tr->buffer, event))
+ trace_buffer_unlock_commit(tr, event, 0, 0);
+
+ out:
+ atomic_dec(&tr->data[cpu]->disabled);
+ local_irq_restore(irq1);
}
-static void trace_bts_at(struct trace_array *tr,
- const struct bts_trace *trace, void *at)
+static void trace_bts_at(const struct bts_trace *trace, void *at)
{
struct bts_struct bts;
int err = 0;
@@ -137,59 +212,98 @@ static void trace_bts_at(struct trace_array *tr,
switch (bts.qualifier) {
case BTS_BRANCH:
- trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
+ trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
break;
}
}
+/*
+ * Collect the trace on the current cpu and write it into the ftrace buffer.
+ *
+ * pre: tracing must be suspended on the current cpu
+ */
static void trace_bts_cpu(void *arg)
{
- struct trace_array *tr = (struct trace_array *) arg;
+ struct trace_array *tr = (struct trace_array *)arg;
const struct bts_trace *trace;
unsigned char *at;
- if (!this_tracer)
+ if (unlikely(!tr))
+ return;
+
+ if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
+ return;
+
+ if (unlikely(!this_tracer))
return;
- ds_suspend_bts(this_tracer);
trace = ds_read_bts(this_tracer);
if (!trace)
- goto out;
+ return;
for (at = trace->ds.top; (void *)at < trace->ds.end;
at += trace->ds.size)
- trace_bts_at(tr, trace, at);
+ trace_bts_at(trace, at);
for (at = trace->ds.begin; (void *)at < trace->ds.top;
at += trace->ds.size)
- trace_bts_at(tr, trace, at);
-
-out:
- ds_resume_bts(this_tracer);
+ trace_bts_at(trace, at);
}
static void trace_bts_prepare(struct trace_iterator *iter)
{
int cpu;
- for_each_cpu(cpu, cpu_possible_mask)
- smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_suspend_bts(per_cpu(tracer, cpu));
+ /*
+ * We need to collect the trace on the respective cpu since ftrace
+ * implicitly adds the record for the current cpu.
+ * Once that is more flexible, we could collect the data from any cpu.
+ */
+ on_each_cpu(trace_bts_cpu, iter->tr, 1);
+
+ for_each_online_cpu(cpu)
+ if (likely(per_cpu(tracer, cpu)))
+ ds_resume_bts(per_cpu(tracer, cpu));
+ put_online_cpus();
+}
+
+static void trace_bts_close(struct trace_iterator *iter)
+{
+ tracing_reset_online_cpus(iter->tr);
+}
+
+void trace_hw_branch_oops(void)
+{
+ if (this_tracer) {
+ ds_suspend_bts_noirq(this_tracer);
+ trace_bts_cpu(hw_branch_trace);
+ ds_resume_bts_noirq(this_tracer);
+ }
}
struct tracer bts_tracer __read_mostly =
{
.name = "hw-branch-tracer",
.init = bts_trace_init,
- .reset = bts_trace_stop,
+ .reset = bts_trace_reset,
.print_header = bts_trace_print_header,
.print_line = bts_trace_print_line,
.start = bts_trace_start,
.stop = bts_trace_stop,
- .open = trace_bts_prepare
+ .open = trace_bts_prepare,
+ .close = trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
};
__init static int init_bts_trace(void)
{
+ register_hotcpu_notifier(&bts_hotcpu_notifier);
return register_tracer(&bts_tracer);
}
device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8..5555b75a0d1 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
/*
- * trace irqs off criticall timings
+ * trace irqs off critical timings
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
* Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@@ -32,6 +32,8 @@ enum {
static int trace_type __read_mostly;
+static int save_lat_flag;
+
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
- trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out_unlock;
- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+ trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
latency = nsecs_to_usecs(delta);
@@ -176,8 +178,7 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- tracing_reset(tr, cpu);
- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+ trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@@ -206,11 +207,10 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- tracing_reset(tr, cpu);
local_save_flags(flags);
- trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@@ -244,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
- trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -353,66 +353,50 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
static void start_irqsoff_tracer(struct trace_array *tr)
{
register_ftrace_function(&trace_ops);
- if (tracing_is_enabled()) {
+ if (tracing_is_enabled())
tracer_enabled = 1;
- save_tracer_enabled = 1;
- } else {
+ else
tracer_enabled = 0;
- save_tracer_enabled = 0;
- }
}
static void stop_irqsoff_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
{
+ save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+ trace_flags |= TRACE_ITER_LATENCY_FMT;
+
+ tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
+ tracing_reset_online_cpus(tr);
start_irqsoff_tracer(tr);
}
static void irqsoff_tracer_reset(struct trace_array *tr)
{
stop_irqsoff_tracer(tr);
+
+ if (!save_lat_flag)
+ trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}
static void irqsoff_tracer_start(struct trace_array *tr)
{
tracer_enabled = 1;
- save_tracer_enabled = 1;
}
static void irqsoff_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_open(struct trace_iterator *iter)
-{
- /* stop the trace while dumping */
- tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_close(struct trace_iterator *iter)
-{
- /* restart tracing */
- tracer_enabled = save_tracer_enabled;
}
#ifdef CONFIG_IRQSOFF_TRACER
@@ -430,8 +414,6 @@ static struct tracer irqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .open = irqsoff_tracer_open,
- .close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@@ -458,8 +440,6 @@ static struct tracer preemptoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .open = irqsoff_tracer_open,
- .close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@@ -488,8 +468,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
- .open = irqsoff_tracer_open,
- .close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1d..c4c9bbda53d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,8 +9,12 @@
#include <linux/kernel.h>
#include <linux/mmiotrace.h>
#include <linux/pci.h>
+#include <linux/time.h>
+
+#include <asm/atomic.h>
#include "trace.h"
+#include "trace_output.h"
struct header_iter {
struct pci_dev *dev;
@@ -19,6 +23,7 @@ struct header_iter {
static struct trace_array *mmio_trace_array;
static bool overrun_detected;
static unsigned long prev_overruns;
+static atomic_t dropped_count;
static void mmio_reset_data(struct trace_array *tr)
{
@@ -121,11 +126,11 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
- unsigned long cnt = 0;
+ unsigned long cnt = atomic_xchg(&dropped_count, 0);
unsigned long over = ring_buffer_overruns(iter->tr->buffer);
if (over > prev_overruns)
- cnt = over - prev_overruns;
+ cnt += over - prev_overruns;
prev_overruns = over;
return cnt;
}
@@ -171,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
struct mmiotrace_rw *rw;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret = 1;
@@ -181,21 +186,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
- "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
ret = trace_seq_printf(s,
- "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
ret = trace_seq_printf(s,
- "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+ "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
+ "%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -217,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
struct mmiotrace_map *m;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret;
@@ -227,14 +233,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,
- "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+ "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
ret = trace_seq_printf(s,
- "UNMAP %lu.%06lu %d 0x%lx %d\n",
+ "UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
@@ -253,18 +259,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
const char *msg = print->buf;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
- unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+ ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (entry->flags & TRACE_FLAG_CONT)
- trace_seq_print_cont(s, iter);
-
return TRACE_TYPE_HANDLED;
}
@@ -304,21 +307,20 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
- unsigned long irq_flags;
+ int pc = preempt_count();
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
+ sizeof(*entry), 0, pc);
+ if (!event) {
+ atomic_inc(&dropped_count);
return;
+ }
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, preempt_count());
- entry->ent.type = TRACE_MMIO_RW;
entry->rw = *rw;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -332,21 +334,20 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
+ struct ring_buffer *buffer = tr->buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
- unsigned long irq_flags;
+ int pc = preempt_count();
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
+ sizeof(*entry), 0, pc);
+ if (!event) {
+ atomic_inc(&dropped_count);
return;
+ }
entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, preempt_count());
- entry->ent.type = TRACE_MMIO_MAP;
entry->map = *map;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
@@ -362,5 +363,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
int mmio_trace_printk(const char *fmt, va_list args)
{
- return trace_vprintk(0, -1, fmt, args);
+ return trace_vprintk(0, fmt, args);
}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767acd30a..394f94417e2 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
static int nop_trace_init(struct trace_array *tr)
{
- int cpu;
ctx_trace = tr;
-
- for_each_online_cpu(cpu)
- tracing_reset(tr, cpu);
-
start_nop_trace(tr);
return 0;
}
@@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
+ .wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 00000000000..e0c2545622e
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,1202 @@
+/*
+ * trace_output.c
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_output.h"
+
+/* must be a power of 2 */
+#define EVENT_HASHSIZE 128
+
+DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
+
+static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+
+static int next_event_type = __TRACE_LAST_TYPE + 1;
+
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+ seq_write(m, s->buffer, len);
+
+ trace_seq_init(s);
+}
+
+enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct bprint_entry *field;
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_bprintf(s, field->fmt, field->buf);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct print_entry *field;
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, "%s", field->buf);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ va_list ap;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ int len = strlen(str);
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
+{
+ unsigned char hex[HEX_CHARS];
+ const unsigned char *data = mem;
+ int i, j;
+
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < len; i++) {
+#else
+ for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ hex[j++] = ' ';
+
+ return trace_seq_putmem(s, hex, j);
+}
+
+void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+ void *ret;
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return NULL;
+
+ ret = s->buffer + s->len;
+ s->len += len;
+
+ return ret;
+}
+
+int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+ unsigned char *p;
+
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ return 0;
+}
+
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+ unsigned long flags,
+ const struct trace_print_flags *flag_array)
+{
+ unsigned long mask;
+ const char *str;
+ const char *ret = p->buffer + p->len;
+ int i;
+
+ for (i = 0; flag_array[i].name && flags; i++) {
+
+ mask = flag_array[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ str = flag_array[i].name;
+ flags &= ~mask;
+ if (p->len && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_puts(p, str);
+ }
+
+ /* check for left over flags */
+ if (flags) {
+ if (p->len && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_printf(p, "0x%lx", flags);
+ }
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_flags_seq);
+
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+ const struct trace_print_flags *symbol_array)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (!p->len)
+ trace_seq_printf(p, "0x%lx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+ return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+ unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ sprint_symbol(str, address);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
+#endif
+ return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
+{
+ struct file *file = NULL;
+ unsigned long vmstart = 0;
+ int ret = 1;
+
+ if (mm) {
+ const struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma) {
+ file = vma->vm_file;
+ vmstart = vma->vm_start;
+ }
+ if (file) {
+ ret = trace_seq_path(s, &file->f_path);
+ if (ret)
+ ret = trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
+ }
+ up_read(&mm->mmap_sem);
+ }
+ if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+ unsigned long sym_flags)
+{
+ struct mm_struct *mm = NULL;
+ int ret = 1;
+ unsigned int i;
+
+ if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(entry->ent.tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = entry->caller[i];
+
+ if (ip == ULONG_MAX || !ret)
+ break;
+ if (ret)
+ ret = trace_seq_puts(s, " => ");
+ if (!ip) {
+ if (ret)
+ ret = trace_seq_puts(s, "??");
+ if (ret)
+ ret = trace_seq_puts(s, "\n");
+ continue;
+ }
+ if (!ret)
+ break;
+ if (ret)
+ ret = seq_print_user_ip(s, mm, ip, sym_flags);
+ ret = trace_seq_puts(s, "\n");
+ }
+
+ if (mm)
+ mmput(mm);
+ return ret;
+}
+
+int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+ int ret;
+
+ if (!ip)
+ return trace_seq_printf(s, "0");
+
+ if (sym_flags & TRACE_ITER_SYM_OFFSET)
+ ret = seq_print_sym_offset(s, "%s", ip);
+ else
+ ret = seq_print_sym_short(s, "%s", ip);
+
+ if (!ret)
+ return 0;
+
+ if (sym_flags & TRACE_ITER_SYM_ADDR)
+ ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return ret;
+}
+
+static int
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+ int hardirq, softirq;
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+
+ if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
+ comm, entry->pid, cpu,
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+ 'X' : '.',
+ (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
+ 'N' : '.',
+ (hardirq && softirq) ? 'H' :
+ hardirq ? 'h' : softirq ? 's' : '.'))
+ return 0;
+
+ if (entry->preempt_count)
+ return trace_seq_printf(s, "%x", entry->preempt_count);
+ return trace_seq_puts(s, ".");
+}
+
+static unsigned long preempt_mark_thresh = 100;
+
+static int
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
+ unsigned long rel_usecs)
+{
+ return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
+ rel_usecs > preempt_mark_thresh ? '!' :
+ rel_usecs > 1 ? '+' : ' ');
+}
+
+int trace_print_context(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+ unsigned long secs = (unsigned long)t;
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
+ comm, entry->pid, iter->cpu, secs, usec_rem);
+}
+
+int trace_print_lat_context(struct trace_iterator *iter)
+{
+ u64 next_ts;
+ int ret;
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent,
+ *next_entry = trace_find_next_entry(iter, NULL,
+ &next_ts);
+ unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+ unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
+ unsigned long rel_usecs;
+
+ if (!next_entry)
+ next_ts = iter->ts;
+ rel_usecs = ns2usecs(next_ts - iter->ts);
+
+ if (verbose) {
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
+ " %ld.%03ldms (+%ld.%03ldms): ", comm,
+ entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx,
+ ns2usecs(iter->ts),
+ abs_usecs / USEC_PER_MSEC,
+ abs_usecs % USEC_PER_MSEC,
+ rel_usecs / USEC_PER_MSEC,
+ rel_usecs % USEC_PER_MSEC);
+ } else {
+ ret = lat_print_generic(s, entry, iter->cpu);
+ if (ret)
+ ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
+ }
+
+ return ret;
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int task_state_char(unsigned long state)
+{
+ int bit = state ? __ffs(state) + 1 : 0;
+
+ return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
+/**
+ * ftrace_find_event - find a registered event
+ * @type: the type of event to look for
+ *
+ * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
+ */
+struct trace_event *ftrace_find_event(int type)
+{
+ struct trace_event *event;
+ struct hlist_node *n;
+ unsigned key;
+
+ key = type & (EVENT_HASHSIZE - 1);
+
+ hlist_for_each_entry(event, n, &event_hash[key], node) {
+ if (event->type == type)
+ return event;
+ }
+
+ return NULL;
+}
+
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+ struct trace_event *e;
+ int last = __TRACE_LAST_TYPE;
+
+ if (list_empty(&ftrace_event_list)) {
+ *list = &ftrace_event_list;
+ return last + 1;
+ }
+
+ /*
+ * We used up all possible max events,
+ * lets see if somebody freed one.
+ */
+ list_for_each_entry(e, &ftrace_event_list, list) {
+ if (e->type != last + 1)
+ break;
+ last++;
+ }
+
+ /* Did we used up all 65 thousand events??? */
+ if ((last + 1) > FTRACE_MAX_EVENT)
+ return 0;
+
+ *list = &e->list;
+ return last + 1;
+}
+
+void trace_event_read_lock(void)
+{
+ down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+ up_read(&trace_event_mutex);
+}
+
+/**
+ * register_ftrace_event - register output for an event type
+ * @event: the event type to register
+ *
+ * Event types are stored in a hash and this hash is used to
+ * find a way to print an event. If the @event->type is set
+ * then it will use that type, otherwise it will assign a
+ * type to use.
+ *
+ * If you assign your own type, please make sure it is added
+ * to the trace_type enum in trace.h, to avoid collisions
+ * with the dynamic types.
+ *
+ * Returns the event type number or zero on error.
+ */
+int register_ftrace_event(struct trace_event *event)
+{
+ unsigned key;
+ int ret = 0;
+
+ down_write(&trace_event_mutex);
+
+ if (WARN_ON(!event))
+ goto out;
+
+ INIT_LIST_HEAD(&event->list);
+
+ if (!event->type) {
+ struct list_head *list = NULL;
+
+ if (next_event_type > FTRACE_MAX_EVENT) {
+
+ event->type = trace_search_list(&list);
+ if (!event->type)
+ goto out;
+
+ } else {
+
+ event->type = next_event_type++;
+ list = &ftrace_event_list;
+ }
+
+ if (WARN_ON(ftrace_find_event(event->type)))
+ goto out;
+
+ list_add_tail(&event->list, list);
+
+ } else if (event->type > __TRACE_LAST_TYPE) {
+ printk(KERN_WARNING "Need to add type to trace.h\n");
+ WARN_ON(1);
+ goto out;
+ } else {
+ /* Is this event already used */
+ if (ftrace_find_event(event->type))
+ goto out;
+ }
+
+ if (event->trace == NULL)
+ event->trace = trace_nop_print;
+ if (event->raw == NULL)
+ event->raw = trace_nop_print;
+ if (event->hex == NULL)
+ event->hex = trace_nop_print;
+ if (event->binary == NULL)
+ event->binary = trace_nop_print;
+
+ key = event->type & (EVENT_HASHSIZE - 1);
+
+ hlist_add_head(&event->node, &event_hash[key]);
+
+ ret = event->type;
+ out:
+ up_write(&trace_event_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_event);
+
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+ hlist_del(&event->node);
+ list_del(&event->list);
+ return 0;
+}
+
+/**
+ * unregister_ftrace_event - remove a no longer used event
+ * @event: the event to remove
+ */
+int unregister_ftrace_event(struct trace_event *event)
+{
+ down_write(&trace_event_mutex);
+ __unregister_ftrace_event(event);
+ up_write(&trace_event_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+
+/*
+ * Standard events
+ */
+
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+{
+ return TRACE_TYPE_HANDLED;
+}
+
+/* TRACE_FN */
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
+ if (!trace_seq_printf(s, " <-"))
+ goto partial;
+ if (!seq_print_ip_sym(s,
+ field->parent_ip,
+ flags))
+ goto partial;
+ }
+ if (!trace_seq_printf(s, "\n"))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+ SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD_RET(s, field->ip);
+ SEQ_PUT_FIELD_RET(s, field->parent_ip);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .trace = trace_fn_trace,
+ .raw = trace_fn_raw,
+ .hex = trace_fn_hex,
+ .binary = trace_fn_bin,
+};
+
+/* TRACE_CTX an TRACE_WAKE */
+static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
+ char *delim)
+{
+ struct ctx_switch_entry *field;
+ char comm[TASK_COMM_LEN];
+ int S, T;
+
+
+ trace_assign_type(field, iter->ent);
+
+ T = task_state_char(field->next_state);
+ S = task_state_char(field->prev_state);
+ trace_find_cmdline(field->next_pid, comm);
+ if (!trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+{
+ return trace_ctxwake_print(iter, "==>");
+}
+
+static enum print_line_t trace_wake_print(struct trace_iterator *iter,
+ int flags)
+{
+ return trace_ctxwake_print(iter, " +");
+}
+
+static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
+{
+ struct ctx_switch_entry *field;
+ int T;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!S)
+ task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+ if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+{
+ return trace_ctxwake_raw(iter, 0);
+}
+
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+{
+ return trace_ctxwake_raw(iter, '+');
+}
+
+
+static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
+{
+ struct ctx_switch_entry *field;
+ struct trace_seq *s = &iter->seq;
+ int T;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!S)
+ task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, S);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD_RET(s, T);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+{
+ return trace_ctxwake_hex(iter, 0);
+}
+
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+{
+ return trace_ctxwake_hex(iter, '+');
+}
+
+static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
+ int flags)
+{
+ struct ctx_switch_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD_RET(s, field->prev_pid);
+ SEQ_PUT_FIELD_RET(s, field->prev_prio);
+ SEQ_PUT_FIELD_RET(s, field->prev_state);
+ SEQ_PUT_FIELD_RET(s, field->next_pid);
+ SEQ_PUT_FIELD_RET(s, field->next_prio);
+ SEQ_PUT_FIELD_RET(s, field->next_state);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .trace = trace_ctx_print,
+ .raw = trace_ctx_raw,
+ .hex = trace_ctx_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .trace = trace_wake_print,
+ .raw = trace_wake_raw,
+ .hex = trace_wake_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+/* TRACE_SPECIAL */
+static enum print_line_t trace_special_print(struct trace_iterator *iter,
+ int flags)
+{
+ struct special_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
+ field->arg1,
+ field->arg2,
+ field->arg3))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_special_hex(struct trace_iterator *iter,
+ int flags)
+{
+ struct special_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+ SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_special_bin(struct trace_iterator *iter,
+ int flags)
+{
+ struct special_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD_RET(s, field->arg1);
+ SEQ_PUT_FIELD_RET(s, field->arg2);
+ SEQ_PUT_FIELD_RET(s, field->arg3);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event trace_special_event = {
+ .type = TRACE_SPECIAL,
+ .trace = trace_special_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_STACK */
+
+static enum print_line_t trace_stack_print(struct trace_iterator *iter,
+ int flags)
+{
+ struct stack_entry *field;
+ struct trace_seq *s = &iter->seq;
+ int i;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_puts(s, "<stack trace>\n"))
+ goto partial;
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
+ break;
+ if (!trace_seq_puts(s, " => "))
+ goto partial;
+
+ if (!seq_print_ip_sym(s, field->caller[i], flags))
+ goto partial;
+ if (!trace_seq_puts(s, "\n"))
+ goto partial;
+ }
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_stack_event = {
+ .type = TRACE_STACK,
+ .trace = trace_stack_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_USER_STACK */
+static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
+ int flags)
+{
+ struct userstack_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_puts(s, "<user stack trace>\n"))
+ goto partial;
+
+ if (!seq_print_userip_objs(field, s, flags))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_user_stack_event = {
+ .type = TRACE_USER_STACK,
+ .trace = trace_user_stack_print,
+ .raw = trace_special_print,
+ .hex = trace_special_hex,
+ .binary = trace_special_bin,
+};
+
+/* TRACE_BPRINT */
+static enum print_line_t
+trace_bprint_print(struct trace_iterator *iter, int flags)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct bprint_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (!trace_seq_puts(s, ": "))
+ goto partial;
+
+ if (!trace_seq_bprintf(s, field->fmt, field->buf))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+
+static enum print_line_t
+trace_bprint_raw(struct trace_iterator *iter, int flags)
+{
+ struct bprint_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(s, ": %lx : ", field->ip))
+ goto partial;
+
+ if (!trace_seq_bprintf(s, field->fmt, field->buf))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+
+static struct trace_event trace_bprint_event = {
+ .type = TRACE_BPRINT,
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
+
+/* TRACE_PRINT */
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+ int flags)
+{
+ struct print_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (!trace_seq_printf(s, ": %s", field->buf))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+{
+ struct print_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .trace = trace_print_print,
+ .raw = trace_print_raw,
+};
+
+
+static struct trace_event *events[] __initdata = {
+ &trace_fn_event,
+ &trace_ctx_event,
+ &trace_wake_event,
+ &trace_special_event,
+ &trace_stack_event,
+ &trace_user_stack_event,
+ &trace_bprint_event,
+ &trace_print_event,
+ NULL
+};
+
+__init static int init_events(void)
+{
+ struct trace_event *event;
+ int i, ret;
+
+ for (i = 0; events[i]; i++) {
+ event = events[i];
+
+ ret = register_ftrace_event(event);
+ if (!ret) {
+ printk(KERN_WARNING "event %d failed to register\n",
+ event->type);
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ return 0;
+}
+device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 00000000000..d38bec4a9c3
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,51 @@
+#ifndef __TRACE_EVENTS_H
+#define __TRACE_EVENTS_H
+
+#include <linux/trace_seq.h>
+#include "trace.h"
+
+extern enum print_line_t
+trace_print_bprintk_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_printk_msg_only(struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags);
+extern int seq_print_userip_objs(const struct userstack_entry *entry,
+ struct trace_seq *s, unsigned long sym_flags);
+extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags);
+
+extern int trace_print_context(struct trace_iterator *iter);
+extern int trace_print_lat_context(struct trace_iterator *iter);
+
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
+extern struct trace_event *ftrace_find_event(int type);
+
+extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
+ int flags);
+
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
+#define MAX_MEMHEX_BYTES 8
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+#define SEQ_PUT_FIELD_RET(s, x) \
+do { \
+ if (!trace_seq_putmem(s, &(x), sizeof(x))) \
+ return TRACE_TYPE_PARTIAL_LINE; \
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x) \
+do { \
+ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
+ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
+ return TRACE_TYPE_PARTIAL_LINE; \
+} while (0)
+
+#endif
+
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248daf5..fe1a00f1445 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,15 +11,123 @@
#include <linux/init.h>
#include <linux/debugfs.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include "trace.h"
+#include "trace_output.h"
static struct trace_array *power_trace;
static int __read_mostly trace_power_enabled;
+static void probe_power_start(struct power_trace *it, unsigned int type,
+ unsigned int level)
+{
+ if (!trace_power_enabled)
+ return;
+
+ memset(it, 0, sizeof(struct power_trace));
+ it->state = level;
+ it->type = type;
+ it->stamp = ktime_get();
+}
+
+
+static void probe_power_end(struct power_trace *it)
+{
+ struct ftrace_event_call *call = &event_power;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct trace_power *entry;
+ struct trace_array_cpu *data;
+ struct trace_array *tr = power_trace;
+
+ if (!trace_power_enabled)
+ return;
+
+ buffer = tr->buffer;
+
+ preempt_disable();
+ it->end = ktime_get();
+ data = tr->data[smp_processor_id()];
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
+ sizeof(*entry), 0, 0);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->state_data = *it;
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, 0);
+ out:
+ preempt_enable();
+}
+
+static void probe_power_mark(struct power_trace *it, unsigned int type,
+ unsigned int level)
+{
+ struct ftrace_event_call *call = &event_power;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct trace_power *entry;
+ struct trace_array_cpu *data;
+ struct trace_array *tr = power_trace;
+
+ if (!trace_power_enabled)
+ return;
+
+ buffer = tr->buffer;
+
+ memset(it, 0, sizeof(struct power_trace));
+ it->state = level;
+ it->type = type;
+ it->stamp = ktime_get();
+ preempt_disable();
+ it->end = it->stamp;
+ data = tr->data[smp_processor_id()];
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
+ sizeof(*entry), 0, 0);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->state_data = *it;
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, 0);
+ out:
+ preempt_enable();
+}
+
+static int tracing_power_register(void)
+{
+ int ret;
+
+ ret = register_trace_power_start(probe_power_start);
+ if (ret) {
+ pr_info("power trace: Couldn't activate tracepoint"
+ " probe to trace_power_start\n");
+ return ret;
+ }
+ ret = register_trace_power_end(probe_power_end);
+ if (ret) {
+ pr_info("power trace: Couldn't activate tracepoint"
+ " probe to trace_power_end\n");
+ goto fail_start;
+ }
+ ret = register_trace_power_mark(probe_power_mark);
+ if (ret) {
+ pr_info("power trace: Couldn't activate tracepoint"
+ " probe to trace_power_mark\n");
+ goto fail_end;
+ }
+ return ret;
+fail_end:
+ unregister_trace_power_end(probe_power_end);
+fail_start:
+ unregister_trace_power_start(probe_power_start);
+ return ret;
+}
static void start_power_trace(struct trace_array *tr)
{
@@ -31,16 +139,23 @@ static void stop_power_trace(struct trace_array *tr)
trace_power_enabled = 0;
}
+static void power_trace_reset(struct trace_array *tr)
+{
+ trace_power_enabled = 0;
+ unregister_trace_power_start(probe_power_start);
+ unregister_trace_power_end(probe_power_end);
+ unregister_trace_power_mark(probe_power_mark);
+}
+
static int power_trace_init(struct trace_array *tr)
{
- int cpu;
power_trace = tr;
trace_power_enabled = 1;
+ tracing_power_register();
- for_each_cpu(cpu, cpu_possible_mask)
- tracing_reset(tr, cpu);
+ tracing_reset_online_cpus(tr);
return 0;
}
@@ -79,14 +194,21 @@ static enum print_line_t power_print_line(struct trace_iterator *iter)
return TRACE_TYPE_UNHANDLED;
}
+static void power_print_header(struct seq_file *s)
+{
+ seq_puts(s, "# TIMESTAMP STATE EVENT\n");
+ seq_puts(s, "# | | |\n");
+}
+
static struct tracer power_tracer __read_mostly =
{
.name = "power",
.init = power_trace_init,
.start = start_power_trace,
.stop = stop_power_trace,
- .reset = stop_power_trace,
+ .reset = power_trace_reset,
.print_line = power_print_line,
+ .print_header = power_print_header,
};
static int init_power_trace(void)
@@ -94,86 +216,3 @@ static int init_power_trace(void)
return register_tracer(&power_tracer);
}
device_initcall(init_power_trace);
-
-void trace_power_start(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- if (!trace_power_enabled)
- return;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
-}
-EXPORT_SYMBOL_GPL(trace_power_start);
-
-
-void trace_power_end(struct power_trace *it)
-{
- struct ring_buffer_event *event;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- unsigned long irq_flags;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- preempt_disable();
- it->end = ktime_get();
- data = tr->data[smp_processor_id()];
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
- entry->ent.type = TRACE_POWER;
- entry->state_data = *it;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
-
- out:
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_end);
-
-void trace_power_mark(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- struct ring_buffer_event *event;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- unsigned long irq_flags;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
- preempt_disable();
- it->end = it->stamp;
- data = tr->data[smp_processor_id()];
-
- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
- &irq_flags);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- tracing_generic_entry_update(&entry->ent, 0, 0);
- entry->ent.type = TRACE_POWER;
- entry->state_data = *it;
- ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-
- trace_wake_up();
-
- out:
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 00000000000..687699d365a
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,252 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#ifdef CONFIG_MODULES
+
+/*
+ * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+/* serialize accesses to trace_bprintk_fmt_list */
+static DEFINE_MUTEX(btrace_mutex);
+
+struct trace_bprintk_fmt {
+ struct list_head list;
+ char fmt[0];
+};
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
+{
+ struct trace_bprintk_fmt *pos;
+ list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+ if (!strcmp(pos->fmt, fmt))
+ return pos;
+ }
+ return NULL;
+}
+
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
+{
+ const char **iter;
+
+ mutex_lock(&btrace_mutex);
+ for (iter = start; iter < end; iter++) {
+ struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+ if (tb_fmt) {
+ *iter = tb_fmt->fmt;
+ continue;
+ }
+
+ tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+ + strlen(*iter) + 1, GFP_KERNEL);
+ if (tb_fmt) {
+ list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+ strcpy(tb_fmt->fmt, *iter);
+ *iter = tb_fmt->fmt;
+ } else
+ *iter = NULL;
+ }
+ mutex_unlock(&btrace_mutex);
+}
+
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ if (mod->num_trace_bprintk_fmt) {
+ const char **start = mod->trace_bprintk_fmt_start;
+ const char **end = start + mod->num_trace_bprintk_fmt;
+
+ if (val == MODULE_STATE_COMING)
+ hold_module_trace_bprintk_format(start, end);
+ }
+ return 0;
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+ .notifier_call = module_trace_bprintk_format_notify,
+};
+
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
+ {
+ int ret;
+ va_list ap;
+
+ if (unlikely(!fmt))
+ return 0;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vbprintk(ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_bprintk);
+
+int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
+ {
+ if (unlikely(!fmt))
+ return 0;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ return trace_vbprintk(ip, fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vprintk(ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ return trace_vprintk(ip, fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
+static void *
+t_start(struct seq_file *m, loff_t *pos)
+{
+ const char **fmt = __start___trace_bprintk_fmt + *pos;
+
+ if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
+ return NULL;
+ return fmt;
+}
+
+static void *t_next(struct seq_file *m, void * v, loff_t *pos)
+{
+ (*pos)++;
+ return t_start(m, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ const char **fmt = v;
+ const char *str = *fmt;
+ int i;
+
+ seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
+
+ /*
+ * Tabs and new lines need to be converted.
+ */
+ for (i = 0; str[i]; i++) {
+ switch (str[i]) {
+ case '\n':
+ seq_puts(m, "\\n");
+ break;
+ case '\t':
+ seq_puts(m, "\\t");
+ break;
+ case '\\':
+ seq_puts(m, "\\");
+ break;
+ case '"':
+ seq_puts(m, "\\\"");
+ break;
+ default:
+ seq_putc(m, str[i]);
+ }
+ }
+ seq_puts(m, "\"\n");
+
+ return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations show_format_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static int
+ftrace_formats_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &show_format_seq_ops);
+}
+
+static const struct file_operations ftrace_formats_fops = {
+ .open = ftrace_formats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static __init int init_trace_printk_function_export(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
+
+ trace_create_file("printk_formats", 0444, d_tracer,
+ NULL, &ftrace_formats_fops);
+
+ return 0;
+}
+
+fs_initcall(init_trace_printk_function_export);
+
+static __init int init_trace_printk(void)
+{
+ return register_module_notifier(&module_trace_bprintk_format_nb);
+}
+
+early_initcall(init_trace_printk);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb4564..5fca0f51fde 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include "trace.h"
@@ -18,6 +18,36 @@ static struct trace_array *ctx_trace;
static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
+static int sched_stopped;
+
+
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
static void
probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,13 +58,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
int cpu;
int pc;
- if (!sched_ref)
+ if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
- if (!tracer_enabled)
+ if (!tracer_enabled || sched_stopped)
return;
pc = preempt_count();
@@ -43,11 +73,41 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
data = ctx_trace->data[cpu];
if (likely(!atomic_read(&data->disabled)))
- tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
+ tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
local_irq_restore(flags);
}
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!filter_check_discard(call, entry, buffer, event))
+ ring_buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr->buffer, flags, 6, pc);
+ ftrace_trace_userstack(tr->buffer, flags, pc);
+}
+
static void
probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
{
@@ -55,18 +115,21 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
unsigned long flags;
int cpu, pc;
- if (!likely(tracer_enabled))
+ if (unlikely(!sched_ref))
return;
- pc = preempt_count();
tracing_record_cmdline(current);
+ if (!tracer_enabled || sched_stopped)
+ return;
+
+ pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = ctx_trace->data[cpu];
if (likely(!atomic_read(&data->disabled)))
- tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
+ tracing_sched_wakeup_trace(ctx_trace, wakee, current,
flags, pc);
local_irq_restore(flags);
@@ -93,7 +156,7 @@ static int tracing_sched_register(void)
ret = register_trace_sched_switch(probe_sched_switch);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
- " probe to kernel_sched_schedule\n");
+ " probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
@@ -185,12 +248,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
ctx_trace = tr;
}
-static void start_sched_trace(struct trace_array *tr)
-{
- tracing_reset_online_cpus(tr);
- tracing_start_sched_switch_record();
-}
-
static void stop_sched_trace(struct trace_array *tr)
{
tracing_stop_sched_switch_record();
@@ -199,7 +256,8 @@ static void stop_sched_trace(struct trace_array *tr)
static int sched_switch_trace_init(struct trace_array *tr)
{
ctx_trace = tr;
- start_sched_trace(tr);
+ tracing_reset_online_cpus(tr);
+ tracing_start_sched_switch_record();
return 0;
}
@@ -211,13 +269,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
static void sched_switch_trace_start(struct trace_array *tr)
{
- tracing_reset_online_cpus(tr);
- tracing_start_sched_switch();
+ sched_stopped = 0;
}
static void sched_switch_trace_stop(struct trace_array *tr)
{
- tracing_stop_sched_switch();
+ sched_stopped = 1;
}
static struct tracer sched_switch_trace __read_mostly =
@@ -227,6 +284,7 @@ static struct tracer sched_switch_trace __read_mostly =
.reset = sched_switch_trace_reset,
.start = sched_switch_trace_start,
.stop = sched_switch_trace_stop,
+ .wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_sched_switch,
#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e3..ad69f105a7c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
#include "trace.h"
@@ -25,12 +25,15 @@ static int __read_mostly tracer_enabled;
static struct task_struct *wakeup_task;
static int wakeup_cpu;
static unsigned wakeup_prio = -1;
+static int wakeup_rt;
static raw_spinlock_t wakeup_lock =
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
static void __wakeup_reset(struct trace_array *tr);
+static int save_lat_flag;
+
#ifdef CONFIG_FUNCTION_TRACER
/*
* irqsoff uses its own tracer function to keep the overhead down:
@@ -71,7 +74,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
if (task_cpu(wakeup_task) != cpu)
goto unlock;
- trace_function(tr, data, ip, parent_ip, flags, pc);
+ trace_function(tr, ip, parent_ip, flags, pc);
unlock:
__raw_spin_unlock(&wakeup_lock);
@@ -135,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
pc = preempt_count();
- /* The task we are waiting for is waking up */
- data = wakeup_trace->data[wakeup_cpu];
-
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -151,7 +151,11 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
if (unlikely(!tracer_enabled || next != wakeup_task))
goto out_unlock;
- trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ /* The task we are waiting for is waking up */
+ data = wakeup_trace->data[wakeup_cpu];
+
+ trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+ tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
/*
* usecs conversion is slow so we try to delay the conversion
@@ -182,14 +186,6 @@ out:
static void __wakeup_reset(struct trace_array *tr)
{
- struct trace_array_cpu *data;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- data = tr->data[cpu];
- tracing_reset(tr, cpu);
- }
-
wakeup_cpu = -1;
wakeup_prio = -1;
@@ -203,6 +199,8 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
+ tracing_reset_online_cpus(tr);
+
local_irq_save(flags);
__raw_spin_lock(&wakeup_lock);
__wakeup_reset(tr);
@@ -213,6 +211,7 @@ static void wakeup_reset(struct trace_array *tr)
static void
probe_wakeup(struct rq *rq, struct task_struct *p, int success)
{
+ struct trace_array_cpu *data;
int cpu = smp_processor_id();
unsigned long flags;
long disabled;
@@ -224,7 +223,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
tracing_record_cmdline(p);
tracing_record_cmdline(current);
- if (likely(!rt_task(p)) ||
+ if ((wakeup_rt && !rt_task(p)) ||
p->prio >= wakeup_prio ||
p->prio >= current->prio)
return;
@@ -252,9 +251,16 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
local_save_flags(flags);
- wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
- trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
- CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+ data = wakeup_trace->data[wakeup_cpu];
+ data->preempt_timestamp = ftrace_now(cpu);
+ tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+ /*
+ * We must be careful in using CALLER_ADDR2. But since wake_up
+ * is not called by an assembly function (where as schedule is)
+ * it should be safe to use it here.
+ */
+ trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
out_locked:
__raw_spin_unlock(&wakeup_lock);
@@ -262,12 +268,6 @@ out:
atomic_dec(&wakeup_trace->data[cpu]->disabled);
}
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
@@ -289,7 +289,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
ret = register_trace_sched_switch(probe_wakeup_sched_switch);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
- " probe to kernel_sched_schedule\n");
+ " probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
@@ -306,13 +306,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
register_ftrace_function(&trace_ops);
- if (tracing_is_enabled()) {
+ if (tracing_is_enabled())
tracer_enabled = 1;
- save_tracer_enabled = 1;
- } else {
+ else
tracer_enabled = 0;
- save_tracer_enabled = 0;
- }
return;
fail_deprobe_wake_new:
@@ -324,53 +321,54 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
unregister_trace_sched_switch(probe_wakeup_sched_switch);
unregister_trace_sched_wakeup_new(probe_wakeup);
unregister_trace_sched_wakeup(probe_wakeup);
}
-static int wakeup_tracer_init(struct trace_array *tr)
+static int __wakeup_tracer_init(struct trace_array *tr)
{
+ save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+ trace_flags |= TRACE_ITER_LATENCY_FMT;
+
+ tracing_max_latency = 0;
wakeup_trace = tr;
start_wakeup_tracer(tr);
return 0;
}
+static int wakeup_tracer_init(struct trace_array *tr)
+{
+ wakeup_rt = 0;
+ return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_rt_tracer_init(struct trace_array *tr)
+{
+ wakeup_rt = 1;
+ return __wakeup_tracer_init(tr);
+}
+
static void wakeup_tracer_reset(struct trace_array *tr)
{
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
+
+ if (!save_lat_flag)
+ trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}
static void wakeup_tracer_start(struct trace_array *tr)
{
wakeup_reset(tr);
tracer_enabled = 1;
- save_tracer_enabled = 1;
}
static void wakeup_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
- save_tracer_enabled = 0;
-}
-
-static void wakeup_tracer_open(struct trace_iterator *iter)
-{
- /* stop the trace while dumping */
- tracer_enabled = 0;
-}
-
-static void wakeup_tracer_close(struct trace_iterator *iter)
-{
- /* forget about any processes we were recording */
- if (save_tracer_enabled) {
- wakeup_reset(iter->tr);
- tracer_enabled = 1;
- }
}
static struct tracer wakeup_tracer __read_mostly =
@@ -380,8 +378,20 @@ static struct tracer wakeup_tracer __read_mostly =
.reset = wakeup_tracer_reset,
.start = wakeup_tracer_start,
.stop = wakeup_tracer_stop,
- .open = wakeup_tracer_open,
- .close = wakeup_tracer_close,
+ .print_max = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+};
+
+static struct tracer wakeup_rt_tracer __read_mostly =
+{
+ .name = "wakeup_rt",
+ .init = wakeup_rt_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
+ .wait_pipe = poll_wait_pipe,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -396,6 +406,10 @@ __init static int init_wakeup_tracer(void)
if (ret)
return ret;
+ ret = register_tracer(&wakeup_rt_tracer);
+ if (ret)
+ return ret;
+
return 0;
}
device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54..d2cdbabb4ea 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
/* Include in trace.c */
+#include <linux/stringify.h>
#include <linux/kthread.h>
#include <linux/delay.h>
@@ -9,11 +10,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_FN:
case TRACE_CTX:
case TRACE_WAKE:
- case TRACE_CONT:
case TRACE_STACK:
case TRACE_PRINT:
case TRACE_SPECIAL:
case TRACE_BRANCH:
+ case TRACE_GRAPH_ENT:
+ case TRACE_GRAPH_RET:
+ case TRACE_HW_BRANCHES:
return 1;
}
return 0;
@@ -23,10 +26,20 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
{
struct ring_buffer_event *event;
struct trace_entry *entry;
+ unsigned int loops = 0;
while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
entry = ring_buffer_event_data(event);
+ /*
+ * The ring buffer is a size of trace_buf_size, if
+ * we loop more than the size, there's something wrong
+ * with the ring buffer.
+ */
+ if (loops++ > trace_buf_size) {
+ printk(KERN_CONT ".. bad ring buffer ");
+ goto failed;
+ }
if (!trace_valid_entry(entry)) {
printk(KERN_CONT ".. invalid entry %d ",
entry->type);
@@ -57,11 +70,20 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
cnt = ring_buffer_entries(tr->buffer);
+ /*
+ * The trace_test_buffer_cpu runs a while loop to consume all data.
+ * If the calling tracer is broken, and is constantly filling
+ * the buffer, this will run forever, and hard lock the box.
+ * We disable the ring buffer while we do this test to prevent
+ * a hard lock up.
+ */
+ tracing_off();
for_each_possible_cpu(cpu) {
ret = trace_test_buffer_cpu(tr, cpu);
if (ret)
break;
}
+ tracing_on();
__raw_spin_unlock(&ftrace_max_lock);
local_irq_restore(flags);
@@ -80,9 +102,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
#ifdef CONFIG_DYNAMIC_FTRACE
-#define __STR(x) #x
-#define STR(x) __STR(x)
-
/* Test dynamic code modification and ftrace filters */
int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
struct trace_array *tr,
@@ -106,17 +125,17 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
func();
/*
- * Some archs *cough*PowerPC*cough* add charachters to the
+ * Some archs *cough*PowerPC*cough* add characters to the
* start of the function names. We simply put a '*' to
- * accomodate them.
+ * accommodate them.
*/
- func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
/* filter only on our function */
ftrace_set_filter(func_name, strlen(func_name), 1);
/* enable tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
@@ -170,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
#else
# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
#endif /* CONFIG_DYNAMIC_FTRACE */
+
/*
* Simple verification test of ftrace function tracer.
* Enable ftrace, sleep 1/10 second, and then read the trace
@@ -190,7 +210,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 1;
tracer_enabled = 1;
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
goto out;
@@ -228,6 +248,91 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Maximum number of functions to trace before diagnosing a hang */
+#define GRAPH_MAX_FUNC_TEST 100000000
+
+static void __ftrace_dump(bool disable_tracing);
+static unsigned int graph_hang_thresh;
+
+/* Wrap the real function entry probe to avoid possible hanging */
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+{
+ /* This is harmlessly racy, we want to approximately detect a hang */
+ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
+ ftrace_graph_stop();
+ printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
+ if (ftrace_dump_on_oops)
+ __ftrace_dump(false);
+ return 0;
+ }
+
+ return trace_graph_entry(trace);
+}
+
+/*
+ * Pretty much the same than for the function tracer from which the selftest
+ * has been borrowed.
+ */
+int
+trace_selftest_startup_function_graph(struct tracer *trace,
+ struct trace_array *tr)
+{
+ int ret;
+ unsigned long count;
+
+ /*
+ * Simulate the init() callback but we attach a watchdog callback
+ * to detect and recover from possible hangs
+ */
+ tracing_reset_online_cpus(tr);
+ set_graph_array(tr);
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry_watchdog);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+ tracing_start_cmdline_record();
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+
+ /* Have we just recovered from a hang? */
+ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
+ tracing_selftest_disabled = true;
+ ret = -1;
+ goto out;
+ }
+
+ tracing_stop();
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(tr, &count);
+
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ /* Don't test dynamic tracing, the function tracer already did */
+
+out:
+ /* Stop it if we failed */
+ if (ret)
+ ftrace_graph_stop();
+
+ return ret;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+
#ifdef CONFIG_IRQSOFF_TRACER
int
trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
@@ -237,7 +342,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -249,6 +354,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
local_irq_disable();
udelay(100);
local_irq_enable();
+
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max irqs off latencies.
+ */
+ trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -291,7 +404,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
}
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -303,6 +416,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
preempt_disable();
udelay(100);
preempt_enable();
+
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max preempt off latencies.
+ */
+ trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -345,10 +466,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
}
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
- goto out;
+ goto out_no_start;
}
/* reset the max latency */
@@ -362,31 +483,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* reverse the order of preempt vs irqs */
local_irq_enable();
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max irqs/preempt off latencies.
+ */
+ trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(tr, NULL);
- if (ret) {
- tracing_start();
+ if (ret)
goto out;
- }
ret = trace_test_buffer(&max_tr, &count);
- if (ret) {
- tracing_start();
+ if (ret)
goto out;
- }
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
ret = -1;
- tracing_start();
goto out;
}
/* do the test by disabling interrupts first this time */
tracing_max_latency = 0;
tracing_start();
+ trace->start(tr);
+
preempt_disable();
local_irq_disable();
udelay(100);
@@ -394,6 +519,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* reverse the order of preempt vs irqs */
local_irq_enable();
+ trace->stop(tr);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
@@ -409,9 +535,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
goto out;
}
- out:
- trace->reset(tr);
+out:
tracing_start();
+out_no_start:
+ trace->reset(tr);
tracing_max_latency = save_max;
return ret;
@@ -477,7 +604,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
wait_for_completion(&isrt);
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -538,7 +665,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
int ret;
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -570,10 +697,10 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
- return 0;
+ return ret;
}
/* Sleep for a 1/10 of a second */
@@ -585,6 +712,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
tracing_start();
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
return ret;
}
#endif /* CONFIG_SYSPROF_TRACER */
@@ -597,7 +729,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
int ret;
/* start the tracing */
- ret = trace->init(tr);
+ ret = tracer_init(trace, tr);
if (ret) {
warn_failed_init_tracer(trace, ret);
return ret;
@@ -612,6 +744,67 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
trace->reset(tr);
tracing_start();
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
return ret;
}
#endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+ struct trace_array *tr)
+{
+ struct trace_iterator *iter;
+ struct tracer tracer;
+ unsigned long count;
+ int ret;
+
+ if (!trace->open) {
+ printk(KERN_CONT "missing open function...");
+ return -1;
+ }
+
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /*
+ * The hw-branch tracer needs to collect the trace from the various
+ * cpu trace buffers - before tracing is stopped.
+ */
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ memcpy(&tracer, trace, sizeof(tracer));
+
+ iter->trace = &tracer;
+ iter->tr = tr;
+ iter->pos = -1;
+ mutex_init(&iter->mutex);
+
+ trace->open(iter);
+
+ mutex_destroy(&iter->mutex);
+ kfree(iter);
+
+ tracing_stop();
+
+ ret = trace_test_buffer(tr, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT "no entries found..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca..0f6facb050a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
};
static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+__next(struct seq_file *m, loff_t *pos)
{
- long i;
-
- (*pos)++;
+ long n = *pos - 1;
- if (v == SEQ_START_TOKEN)
- i = 0;
- else {
- i = *(long *)v;
- i++;
- }
-
- if (i >= max_stack_trace.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
- m->private = (void *)i;
-
+ m->private = (void *)n;
return &m->private;
}
-static void *t_start(struct seq_file *m, loff_t *pos)
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
{
- void *t = SEQ_START_TOKEN;
- loff_t l = 0;
+ (*pos)++;
+ return __next(m, pos);
+}
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
local_irq_disable();
__raw_spin_lock(&max_stack_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
- for (; t && l < *pos; t = t_next(m, t, &l))
- ;
-
- return t;
+ return __next(m, pos);
}
static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,19 @@ static void t_stop(struct seq_file *m, void *p)
static int trace_lookup_stack(struct seq_file *m, long i)
{
unsigned long addr = stack_dump_trace[i];
-#ifdef CONFIG_KALLSYMS
- char str[KSYM_SYMBOL_LEN];
- sprint_symbol(str, addr);
+ return seq_printf(m, "%pF\n", (void *)addr);
+}
- return seq_printf(m, "%s\n", str);
-#else
- return seq_printf(m, "%p\n", (void*)addr);
-#endif
+static void print_disabled(struct seq_file *m)
+{
+ seq_puts(m, "#\n"
+ "# Stack tracer disabled\n"
+ "#\n"
+ "# To enable the stack tracer, either add 'stacktrace' to the\n"
+ "# kernel command line\n"
+ "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
+ "#\n");
}
static int t_show(struct seq_file *m, void *v)
@@ -251,10 +245,14 @@ static int t_show(struct seq_file *m, void *v)
int size;
if (v == SEQ_START_TOKEN) {
- seq_printf(m, " Depth Size Location"
+ seq_printf(m, " Depth Size Location"
" (%d entries)\n"
- " ----- ---- --------\n",
- max_stack_trace.nr_entries);
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries - 1);
+
+ if (!stack_tracer_enabled && !max_stack_size)
+ print_disabled(m);
+
return 0;
}
@@ -286,17 +284,14 @@ static const struct seq_operations stack_trace_seq_ops = {
static int stack_trace_open(struct inode *inode, struct file *file)
{
- int ret;
-
- ret = seq_open(file, &stack_trace_seq_ops);
-
- return ret;
+ return seq_open(file, &stack_trace_seq_ops);
}
static const struct file_operations stack_trace_fops = {
.open = stack_trace_open,
.read = seq_read,
.llseek = seq_lseek,
+ .release = seq_release,
};
int
@@ -311,10 +306,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
if (ret || !write ||
- (last_stack_tracer_enabled == stack_tracer_enabled))
+ (last_stack_tracer_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = stack_tracer_enabled;
+ last_stack_tracer_enabled = !!stack_tracer_enabled;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
@@ -337,19 +332,14 @@ __setup("stacktrace", enable_stacktrace);
static __init int stack_trace_init(void)
{
struct dentry *d_tracer;
- struct dentry *entry;
d_tracer = tracing_init_dentry();
- entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
- &max_stack_size, &stack_max_size_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+ trace_create_file("stack_max_size", 0644, d_tracer,
+ &max_stack_size, &stack_max_size_fops);
- entry = debugfs_create_file("stack_trace", 0444, d_tracer,
- NULL, &stack_trace_fops);
- if (!entry)
- pr_warning("Could not create debugfs 'stack_trace' entry\n");
+ trace_create_file("stack_trace", 0444, d_tracer,
+ NULL, &stack_trace_fops);
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 00000000000..a4bb239eb98
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,387 @@
+/*
+ * Infrastructure for statistic tracing (histogram output).
+ *
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Based on the code from trace_branch.c which is
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/debugfs.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+ struct rb_node node;
+ void *stat;
+};
+
+/* A stat session is the stats output in one file */
+struct stat_session {
+ struct list_head session_list;
+ struct tracer_stat *ts;
+ struct rb_root stat_root;
+ struct mutex stat_mutex;
+ struct dentry *file;
+};
+
+/* All of the sessions currently in use. Each stat file embed one session */
+static LIST_HEAD(all_stat_sessions);
+static DEFINE_MUTEX(all_stat_sessions_mutex);
+
+/* The root directory for all stat files */
+static struct dentry *stat_dir;
+
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct tracer_stat *ts,
+ struct rb_node *node)
+{
+ struct stat_node *snode;
+ struct rb_node *parent = rb_parent(node);
+
+ if (node->rb_left)
+ return node->rb_left;
+ else if (node->rb_right)
+ return node->rb_right;
+ else {
+ if (!parent)
+ ;
+ else if (parent->rb_left == node)
+ parent->rb_left = NULL;
+ else
+ parent->rb_right = NULL;
+
+ snode = container_of(node, struct stat_node, node);
+ if (ts->stat_release)
+ ts->stat_release(snode->stat);
+ kfree(snode);
+
+ return parent;
+ }
+}
+
+static void __reset_stat_session(struct stat_session *session)
+{
+ struct rb_node *node = session->stat_root.rb_node;
+
+ while (node)
+ node = release_next(session->ts, node);
+
+ session->stat_root = RB_ROOT;
+}
+
+static void reset_stat_session(struct stat_session *session)
+{
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+}
+
+static void destroy_session(struct stat_session *session)
+{
+ debugfs_remove(session->file);
+ __reset_stat_session(session);
+ mutex_destroy(&session->stat_mutex);
+ kfree(session);
+}
+
+typedef int (*cmp_stat_t)(void *, void *);
+
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct stat_node *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->stat = stat;
+
+ /*
+ * Figure out where to put new node
+ * This is a descendent sorting
+ */
+ while (*new) {
+ struct stat_node *this;
+ int result;
+
+ this = container_of(*new, struct stat_node, node);
+ result = cmp(data->stat, this->stat);
+
+ parent = *new;
+ if (result >= 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+ return 0;
+}
+
+/*
+ * For tracers that don't provide a stat_cmp callback.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
+ */
+static int dummy_cmp(void *p1, void *p2)
+{
+ return -1;
+}
+
+/*
+ * Initialize the stat rbtree at each trace_stat file opening.
+ * All of these copies and sorting are required on all opening
+ * since the stats could have changed between two file sessions.
+ */
+static int stat_seq_init(struct stat_session *session)
+{
+ struct tracer_stat *ts = session->ts;
+ struct rb_root *root = &session->stat_root;
+ void *stat;
+ int ret = 0;
+ int i;
+
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+
+ if (!ts->stat_cmp)
+ ts->stat_cmp = dummy_cmp;
+
+ stat = ts->stat_start(ts);
+ if (!stat)
+ goto exit;
+
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
+ goto exit;
+
+ /*
+ * Iterate over the tracer stat entries and store them in an rbtree.
+ */
+ for (i = 1; ; i++) {
+ stat = ts->stat_next(stat, i);
+
+ /* End of insertion */
+ if (!stat)
+ break;
+
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
+ goto exit_free_rbtree;
+ }
+
+exit:
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+
+exit_free_rbtree:
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+}
+
+
+static void *stat_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct stat_session *session = s->private;
+ struct rb_node *node;
+ int n = *pos;
+ int i;
+
+ /* Prevent from tracer switch or rbtree modification */
+ mutex_lock(&session->stat_mutex);
+
+ /* If we are in the beginning of the file, print the headers */
+ if (session->ts->stat_headers) {
+ if (n == 0)
+ return SEQ_START_TOKEN;
+ n--;
+ }
+
+ node = rb_first(&session->stat_root);
+ for (i = 0; node && i < n; i++)
+ node = rb_next(node);
+
+ return node;
+}
+
+static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
+{
+ struct stat_session *session = s->private;
+ struct rb_node *node = p;
+
+ (*pos)++;
+
+ if (p == SEQ_START_TOKEN)
+ return rb_first(&session->stat_root);
+
+ return rb_next(node);
+}
+
+static void stat_seq_stop(struct seq_file *s, void *p)
+{
+ struct stat_session *session = s->private;
+ mutex_unlock(&session->stat_mutex);
+}
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+ struct stat_session *session = s->private;
+ struct stat_node *l = container_of(v, struct stat_node, node);
+
+ if (v == SEQ_START_TOKEN)
+ return session->ts->stat_headers(s);
+
+ return session->ts->stat_show(s, l->stat);
+}
+
+static const struct seq_operations trace_stat_seq_ops = {
+ .start = stat_seq_start,
+ .next = stat_seq_next,
+ .stop = stat_seq_stop,
+ .show = stat_seq_show
+};
+
+/* The session stat is refilled and resorted at each stat file opening */
+static int tracing_stat_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct stat_session *session = inode->i_private;
+
+ ret = stat_seq_init(session);
+ if (ret)
+ return ret;
+
+ ret = seq_open(file, &trace_stat_seq_ops);
+ if (ret) {
+ reset_stat_session(session);
+ return ret;
+ }
+
+ m = file->private_data;
+ m->private = session;
+ return ret;
+}
+
+/*
+ * Avoid consuming memory with our now useless rbtree.
+ */
+static int tracing_stat_release(struct inode *i, struct file *f)
+{
+ struct stat_session *session = i->i_private;
+
+ reset_stat_session(session);
+
+ return seq_release(i, f);
+}
+
+static const struct file_operations tracing_stat_fops = {
+ .open = tracing_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_stat_release
+};
+
+static int tracing_stat_init(void)
+{
+ struct dentry *d_tracing;
+
+ d_tracing = tracing_init_dentry();
+
+ stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+ if (!stat_dir)
+ pr_warning("Could not create debugfs "
+ "'trace_stat' entry\n");
+ return 0;
+}
+
+static int init_stat_file(struct stat_session *session)
+{
+ if (!stat_dir && tracing_stat_init())
+ return -ENODEV;
+
+ session->file = debugfs_create_file(session->ts->name, 0644,
+ stat_dir,
+ session, &tracing_stat_fops);
+ if (!session->file)
+ return -ENOMEM;
+ return 0;
+}
+
+int register_stat_tracer(struct tracer_stat *trace)
+{
+ struct stat_session *session, *node;
+ int ret;
+
+ if (!trace)
+ return -EINVAL;
+
+ if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+ return -EINVAL;
+
+ /* Already registered? */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry(node, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ mutex_unlock(&all_stat_sessions_mutex);
+ return -EINVAL;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ /* Init the session */
+ session = kzalloc(sizeof(*session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
+
+ session->ts = trace;
+ INIT_LIST_HEAD(&session->session_list);
+ mutex_init(&session->stat_mutex);
+
+ ret = init_stat_file(session);
+ if (ret) {
+ destroy_session(session);
+ return ret;
+ }
+
+ /* Register */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_add_tail(&session->session_list, &all_stat_sessions);
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ return 0;
+}
+
+void unregister_stat_tracer(struct tracer_stat *trace)
+{
+ struct stat_session *node, *tmp;
+
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ list_del(&node->session_list);
+ destroy_session(node);
+ break;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 00000000000..8f03914b9a6
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,33 @@
+#ifndef __TRACE_STAT_H
+#define __TRACE_STAT_H
+
+#include <linux/seq_file.h>
+
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+ /* The name of your stat file */
+ const char *name;
+ /* Iteration over statistic entries */
+ void *(*stat_start)(struct tracer_stat *trace);
+ void *(*stat_next)(void *prev, int idx);
+ /* Compare two entries for stats sorting */
+ int (*stat_cmp)(void *p1, void *p2);
+ /* Print a stat entry */
+ int (*stat_show)(struct seq_file *s, void *p);
+ /* Release an entry */
+ void (*stat_release)(void *stat);
+ /* Print the headers of your stat entries */
+ int (*stat_headers)(struct seq_file *s);
+};
+
+/*
+ * Destroy or create a stat file
+ */
+extern int register_stat_tracer(struct tracer_stat *trace);
+extern void unregister_stat_tracer(struct tracer_stat *trace);
+
+#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 00000000000..8712ce3c6a0
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,523 @@
+#include <trace/syscall.h>
+#include <trace/events/syscalls.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/perf_counter.h>
+#include <asm/syscall.h>
+
+#include "trace_output.h"
+#include "trace.h"
+
+static DEFINE_MUTEX(syscall_trace_lock);
+static int sys_refcount_enter;
+static int sys_refcount_exit;
+static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+
+enum print_line_t
+print_syscall_enter(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
+ struct syscall_trace_enter *trace;
+ struct syscall_metadata *entry;
+ int i, ret, syscall;
+
+ trace = (typeof(trace))ent;
+ syscall = trace->nr;
+ entry = syscall_nr_to_meta(syscall);
+
+ if (!entry)
+ goto end;
+
+ if (entry->enter_id != ent->type) {
+ WARN_ON_ONCE(1);
+ goto end;
+ }
+
+ ret = trace_seq_printf(s, "%s(", entry->name);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ for (i = 0; i < entry->nb_args; i++) {
+ /* parameter types */
+ if (trace_flags & TRACE_ITER_VERBOSE) {
+ ret = trace_seq_printf(s, "%s ", entry->types[i]);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+ /* parameter values */
+ ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ ret = trace_seq_putc(s, ')');
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+end:
+ ret = trace_seq_putc(s, '\n');
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_syscall_exit(struct trace_iterator *iter, int flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
+ struct syscall_trace_exit *trace;
+ int syscall;
+ struct syscall_metadata *entry;
+ int ret;
+
+ trace = (typeof(trace))ent;
+ syscall = trace->nr;
+ entry = syscall_nr_to_meta(syscall);
+
+ if (!entry) {
+ trace_seq_printf(s, "\n");
+ return TRACE_TYPE_HANDLED;
+ }
+
+ if (entry->exit_id != ent->type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
+ ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace->ret);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+extern char *__bad_type_size(void);
+
+#define SYSCALL_FIELD(type, name) \
+ sizeof(type) != sizeof(trace.name) ? \
+ __bad_type_size() : \
+ #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
+
+int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
+{
+ int i;
+ int nr;
+ int ret;
+ struct syscall_metadata *entry;
+ struct syscall_trace_enter trace;
+ int offset = offsetof(struct syscall_trace_enter, args);
+
+ nr = syscall_name_to_nr(call->data);
+ entry = syscall_nr_to_meta(nr);
+
+ if (!entry)
+ return 0;
+
+ ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+ SYSCALL_FIELD(int, nr));
+ if (!ret)
+ return 0;
+
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
+ entry->args[i]);
+ if (!ret)
+ return 0;
+ ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
+ sizeof(unsigned long));
+ if (!ret)
+ return 0;
+ offset += sizeof(unsigned long);
+ }
+
+ trace_seq_puts(s, "\nprint fmt: \"");
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
+ sizeof(unsigned long),
+ i == entry->nb_args - 1 ? "" : ", ");
+ if (!ret)
+ return 0;
+ }
+ trace_seq_putc(s, '"');
+
+ for (i = 0; i < entry->nb_args; i++) {
+ ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
+ entry->args[i]);
+ if (!ret)
+ return 0;
+ }
+
+ return trace_seq_putc(s, '\n');
+}
+
+int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
+{
+ int ret;
+ struct syscall_trace_exit trace;
+
+ ret = trace_seq_printf(s,
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+ SYSCALL_FIELD(int, nr),
+ SYSCALL_FIELD(unsigned long, ret));
+ if (!ret)
+ return 0;
+
+ return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
+}
+
+int syscall_enter_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_enter trace;
+ struct syscall_metadata *meta;
+ int ret;
+ int nr;
+ int i;
+ int offset = offsetof(typeof(trace), args);
+
+ nr = syscall_name_to_nr(call->data);
+ meta = syscall_nr_to_meta(nr);
+
+ if (!meta)
+ return 0;
+
+ ret = trace_define_common_fields(call);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < meta->nb_args; i++) {
+ ret = trace_define_field(call, meta->types[i],
+ meta->args[i], offset,
+ sizeof(unsigned long), 0,
+ FILTER_OTHER);
+ offset += sizeof(unsigned long);
+ }
+
+ return ret;
+}
+
+int syscall_exit_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_exit trace;
+ int ret;
+
+ ret = trace_define_common_fields(call);
+ if (ret)
+ return ret;
+
+ ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
+ FILTER_OTHER);
+
+ return ret;
+}
+
+void ftrace_syscall_enter(struct pt_regs *regs, long id)
+{
+ struct syscall_trace_enter *entry;
+ struct syscall_metadata *sys_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size;
+ int syscall_nr;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
+ if (!test_bit(syscall_nr, enabled_enter_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+
+ event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
+ size, 0, 0);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+
+ if (!filter_current_check_discard(buffer, sys_data->enter_event,
+ entry, event))
+ trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+}
+
+void ftrace_syscall_exit(struct pt_regs *regs, long ret)
+{
+ struct syscall_trace_exit *entry;
+ struct syscall_metadata *sys_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int syscall_nr;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
+ if (!test_bit(syscall_nr, enabled_exit_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
+ sizeof(*entry), 0, 0);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->nr = syscall_nr;
+ entry->ret = syscall_get_return_value(current, regs);
+
+ if (!filter_current_check_discard(buffer, sys_data->exit_event,
+ entry, event))
+ trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+}
+
+int reg_event_syscall_enter(void *ptr)
+{
+ int ret = 0;
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_enter)
+ ret = register_trace_sys_enter(ftrace_syscall_enter);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_enter_syscalls);
+ sys_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_event_syscall_enter(void *ptr)
+{
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_enter--;
+ clear_bit(num, enabled_enter_syscalls);
+ if (!sys_refcount_enter)
+ unregister_trace_sys_enter(ftrace_syscall_enter);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+int reg_event_syscall_exit(void *ptr)
+{
+ int ret = 0;
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_refcount_exit)
+ ret = register_trace_sys_exit(ftrace_syscall_exit);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall exit trace point");
+ } else {
+ set_bit(num, enabled_exit_syscalls);
+ sys_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_event_syscall_exit(void *ptr)
+{
+ int num;
+ char *name;
+
+ name = (char *)ptr;
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return;
+ mutex_lock(&syscall_trace_lock);
+ sys_refcount_exit--;
+ clear_bit(num, enabled_exit_syscalls);
+ if (!sys_refcount_exit)
+ unregister_trace_sys_exit(ftrace_syscall_exit);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+struct trace_event event_syscall_enter = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event event_syscall_exit = {
+ .trace = print_syscall_exit,
+};
+
+#ifdef CONFIG_EVENT_PROFILE
+
+static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
+static int sys_prof_refcount_enter;
+static int sys_prof_refcount_exit;
+
+static void prof_syscall_enter(struct pt_regs *regs, long id)
+{
+ struct syscall_trace_enter *rec;
+ struct syscall_metadata *sys_data;
+ int syscall_nr;
+ int size;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ /* get the size after alignment with the u32 buffer size field */
+ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ do {
+ char raw_data[size];
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ rec = (struct syscall_trace_enter *) raw_data;
+ tracing_generic_entry_update(&rec->ent, 0, 0);
+ rec->ent.type = sys_data->enter_id;
+ rec->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ (unsigned long *)&rec->args);
+ perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+ } while(0);
+}
+
+int reg_prof_syscall_enter(char *name)
+{
+ int ret = 0;
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_prof_refcount_enter)
+ ret = register_trace_sys_enter(prof_syscall_enter);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_prof_enter_syscalls);
+ sys_prof_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_prof_syscall_enter(char *name)
+{
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_prof_refcount_enter--;
+ clear_bit(num, enabled_prof_enter_syscalls);
+ if (!sys_prof_refcount_enter)
+ unregister_trace_sys_enter(prof_syscall_enter);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static void prof_syscall_exit(struct pt_regs *regs, long ret)
+{
+ struct syscall_metadata *sys_data;
+ struct syscall_trace_exit rec;
+ int syscall_nr;
+
+ syscall_nr = syscall_get_nr(current, regs);
+ if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ tracing_generic_entry_update(&rec.ent, 0, 0);
+ rec.ent.type = sys_data->exit_id;
+ rec.nr = syscall_nr;
+ rec.ret = syscall_get_return_value(current, regs);
+
+ perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+}
+
+int reg_prof_syscall_exit(char *name)
+{
+ int ret = 0;
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return -ENOSYS;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_prof_refcount_exit)
+ ret = register_trace_sys_exit(prof_syscall_exit);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_prof_exit_syscalls);
+ sys_prof_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+void unreg_prof_syscall_exit(char *name)
+{
+ int num;
+
+ num = syscall_name_to_nr(name);
+ if (num < 0 || num >= NR_syscalls)
+ return;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_prof_refcount_exit--;
+ clear_bit(num, enabled_prof_exit_syscalls);
+ if (!sys_prof_refcount_exit)
+ unregister_trace_sys_exit(prof_syscall_exit);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+#endif
+
+
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad803f..f6693969287 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
}
-const static struct stacktrace_ops backtrace_ops = {
+static const struct stacktrace_ops backtrace_ops = {
.warning = backtrace_warning,
.warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
- hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);
}
static void start_stack_timers(void)
@@ -226,15 +227,6 @@ static void stop_stack_timers(void)
stop_stack_timer(cpu);
}
-static void start_stack_trace(struct trace_array *tr)
-{
- mutex_lock(&sample_timer_lock);
- tracing_reset_online_cpus(tr);
- start_stack_timers();
- tracer_enabled = 1;
- mutex_unlock(&sample_timer_lock);
-}
-
static void stop_stack_trace(struct trace_array *tr)
{
mutex_lock(&sample_timer_lock);
@@ -247,12 +239,18 @@ static int stack_trace_init(struct trace_array *tr)
{
sysprof_trace = tr;
- start_stack_trace(tr);
+ tracing_start_cmdline_record();
+
+ mutex_lock(&sample_timer_lock);
+ start_stack_timers();
+ tracer_enabled = 1;
+ mutex_unlock(&sample_timer_lock);
return 0;
}
static void stack_trace_reset(struct trace_array *tr)
{
+ tracing_stop_cmdline_record();
stop_stack_trace(tr);
}
@@ -317,18 +315,14 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static struct file_operations sysprof_sample_fops = {
+static const struct file_operations sysprof_sample_fops = {
.read = sysprof_sample_read,
.write = sysprof_sample_write,
};
void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
{
- struct dentry *entry;
- entry = debugfs_create_file("sysprof_sample_period", 0644,
+ trace_create_file("sysprof_sample_period", 0644,
d_tracer, NULL, &sysprof_sample_fops);
- if (entry)
- return;
- pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 00000000000..40cafb07dff
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,295 @@
+/*
+ * Workqueue statistical tracer.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+
+#include <trace/events/workqueue.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/kref.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* A cpu workqueue thread */
+struct cpu_workqueue_stats {
+ struct list_head list;
+ struct kref kref;
+ int cpu;
+ pid_t pid;
+/* Can be inserted from interrupt or user context, need to be atomic */
+ atomic_t inserted;
+/*
+ * Don't need to be atomic, works are serialized in a single workqueue thread
+ * on a single CPU.
+ */
+ unsigned int executed;
+};
+
+/* List of workqueue threads on one cpu */
+struct workqueue_global_stats {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+/* Don't need a global lock because allocated before the workqueues, and
+ * never freed.
+ */
+static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
+#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
+
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+ kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
+/* Insertion of a work */
+static void
+probe_workqueue_insertion(struct task_struct *wq_thread,
+ struct work_struct *work)
+{
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
+ if (node->pid == wq_thread->pid) {
+ atomic_inc(&node->inserted);
+ goto found;
+ }
+ }
+ pr_debug("trace_workqueue: entry not found\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Execution of a work */
+static void
+probe_workqueue_execution(struct task_struct *wq_thread,
+ struct work_struct *work)
+{
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
+ if (node->pid == wq_thread->pid) {
+ node->executed++;
+ goto found;
+ }
+ }
+ pr_debug("trace_workqueue: entry not found\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Creation of a cpu workqueue thread */
+static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+{
+ struct cpu_workqueue_stats *cws;
+ unsigned long flags;
+
+ WARN_ON(cpu < 0);
+
+ /* Workqueues are sometimes created in atomic context */
+ cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
+ if (!cws) {
+ pr_warning("trace_workqueue: not enough memory\n");
+ return;
+ }
+ INIT_LIST_HEAD(&cws->list);
+ kref_init(&cws->kref);
+ cws->cpu = cpu;
+ cws->pid = wq_thread->pid;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/* Destruction of a cpu workqueue thread */
+static void probe_workqueue_destruction(struct task_struct *wq_thread)
+{
+ /* Workqueue only execute on one cpu */
+ int cpu = cpumask_first(&wq_thread->cpus_allowed);
+ struct cpu_workqueue_stats *node, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
+ list) {
+ if (node->pid == wq_thread->pid) {
+ list_del(&node->list);
+ kref_put(&node->kref, cpu_workqueue_stat_free);
+ goto found;
+ }
+ }
+
+ pr_debug("trace_workqueue: don't find workqueue to destroy\n");
+found:
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+}
+
+static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
+{
+ unsigned long flags;
+ struct cpu_workqueue_stats *ret = NULL;
+
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
+ ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
+ struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
+ }
+
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ return ret;
+}
+
+static void *workqueue_stat_start(struct tracer_stat *trace)
+{
+ int cpu;
+ void *ret = NULL;
+
+ for_each_possible_cpu(cpu) {
+ ret = workqueue_stat_start_cpu(cpu);
+ if (ret)
+ return ret;
+ }
+ return NULL;
+}
+
+static void *workqueue_stat_next(void *prev, int idx)
+{
+ struct cpu_workqueue_stats *prev_cws = prev;
+ struct cpu_workqueue_stats *ret;
+ int cpu = prev_cws->cpu;
+ unsigned long flags;
+
+ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+ if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+ do {
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+ } while (!(ret = workqueue_stat_start_cpu(cpu)));
+ return ret;
+ } else {
+ ret = list_entry(prev_cws->list.next,
+ struct cpu_workqueue_stats, list);
+ kref_get(&ret->kref);
+ }
+ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+ return ret;
+}
+
+static int workqueue_stat_show(struct seq_file *s, void *p)
+{
+ struct cpu_workqueue_stats *cws = p;
+ struct pid *pid;
+ struct task_struct *tsk;
+
+ pid = find_get_pid(cws->pid);
+ if (pid) {
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (tsk) {
+ seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
+ atomic_read(&cws->inserted), cws->executed,
+ tsk->comm);
+ put_task_struct(tsk);
+ }
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
+static void workqueue_stat_release(void *stat)
+{
+ struct cpu_workqueue_stats *node = stat;
+
+ kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
+static int workqueue_stat_headers(struct seq_file *s)
+{
+ seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
+ seq_printf(s, "# | | | |\n");
+ return 0;
+}
+
+struct tracer_stat workqueue_stats __read_mostly = {
+ .name = "workqueues",
+ .stat_start = workqueue_stat_start,
+ .stat_next = workqueue_stat_next,
+ .stat_show = workqueue_stat_show,
+ .stat_release = workqueue_stat_release,
+ .stat_headers = workqueue_stat_headers
+};
+
+
+int __init stat_workqueue_init(void)
+{
+ if (register_stat_tracer(&workqueue_stats)) {
+ pr_warning("Unable to register workqueue stat tracer\n");
+ return 1;
+ }
+
+ return 0;
+}
+fs_initcall(stat_workqueue_init);
+
+/*
+ * Workqueues are created very early, just after pre-smp initcalls.
+ * So we must register our tracepoints at this stage.
+ */
+int __init trace_workqueue_early_init(void)
+{
+ int ret, cpu;
+
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ if (ret)
+ goto out;
+
+ ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ if (ret)
+ goto no_insertion;
+
+ ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ if (ret)
+ goto no_execution;
+
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ if (ret)
+ goto no_creation;
+
+ for_each_possible_cpu(cpu) {
+ spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+ INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+ }
+
+ return 0;
+
+no_creation:
+ unregister_trace_workqueue_creation(probe_workqueue_creation);
+no_execution:
+ unregister_trace_workqueue_execution(probe_workqueue_execution);
+no_insertion:
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+out:
+ pr_warning("trace_workqueue: unable to trace workqueues\n");
+
+ return 1;
+}
+early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 79602740bbb..9489a0a9b1b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
#include <linux/tracepoint.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/sched.h>
extern struct tracepoint __start___tracepoints[];
extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
{
WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+ if (elem->regfunc && !elem->state && active)
+ elem->regfunc();
+ else if (elem->unregfunc && elem->state && !active)
+ elem->unregfunc();
+
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
* probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
*/
static void disable_tracepoint(struct tracepoint *elem)
{
+ if (elem->unregfunc && elem->state)
+ elem->unregfunc();
+
elem->state = 0;
rcu_assign_pointer(elem->funcs, NULL);
}
@@ -272,12 +281,15 @@ static void disable_tracepoint(struct tracepoint *elem)
*
* Updates the probe callback corresponding to a range of tracepoints.
*/
-void tracepoint_update_probe_range(struct tracepoint *begin,
- struct tracepoint *end)
+void
+tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
{
struct tracepoint *iter;
struct tracepoint_entry *mark_entry;
+ if (!begin)
+ return;
+
mutex_lock(&tracepoints_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_tracepoint(iter->name);
@@ -551,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
- break;
case MODULE_STATE_GOING:
tracepoint_update_probe_range(mod->tracepoints,
mod->tracepoints + mod->num_tracepoints);
@@ -574,3 +583,41 @@ static int init_tracepoints(void)
__initcall(init_tracepoints);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+static int sys_tracepoint_refcount;
+
+void syscall_regfunc(void)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+
+ if (!sys_tracepoint_refcount) {
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ /* Skip kernel threads. */
+ if (t->mm)
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ } while_each_thread(g, t);
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ sys_tracepoint_refcount++;
+}
+
+void syscall_unregfunc(void)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+
+ sys_tracepoint_refcount--;
+ if (!sys_tracepoint_refcount) {
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ } while_each_thread(g, t);
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+}
+#endif
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b05a4..00d59d048ed 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
if (likely(tsk->mm)) {
cputime_t time, dtime;
struct timeval value;
+ unsigned long flags;
u64 delta;
+ local_irq_save(flags);
time = tsk->stime + tsk->utime;
dtime = cputime_sub(time, tsk->acct_timexpd);
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
delta = delta * USEC_PER_SEC + value.tv_usec;
if (delta == 0)
- return;
+ goto out;
tsk->acct_timexpd = time;
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ out:
+ local_irq_restore(flags);
}
}
diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f44..2c000e7132a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
struct user_namespace init_user_ns = {
.kref = {
- .refcount = ATOMIC_INIT(1),
+ .refcount = ATOMIC_INIT(2),
},
.creator = &root_user,
};
@@ -72,21 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
static void uid_hash_remove(struct user_struct *up)
{
hlist_del_init(&up->uidhash_node);
-}
-
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
- struct user_struct *user;
- struct hlist_node *h;
-
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
- if (user->uid == uid) {
- atomic_inc(&user->__count);
- return user;
- }
- }
-
- return NULL;
+ put_user_ns(up->user_ns);
}
#ifdef CONFIG_USER_SCHED
@@ -118,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ /* possibly resurrect an "almost deleted" object */
+ if (atomic_inc_return(&user->__count) == 1)
+ cancel_delayed_work(&user->work);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex);
@@ -282,38 +285,35 @@ int __init uids_sysfs_init(void)
return uids_user_create(&root_user);
}
-/* work function to remove sysfs directory for a user and free up
+/* delayed work function to remove sysfs directory for a user and free up
* corresponding structures.
*/
-static void remove_user_sysfs_dir(struct work_struct *w)
+static void cleanup_user_struct(struct work_struct *w)
{
- struct user_struct *up = container_of(w, struct user_struct, work);
+ struct user_struct *up = container_of(w, struct user_struct, work.work);
unsigned long flags;
int remove_user = 0;
- if (up->user_ns != &init_user_ns)
- return;
/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
* atomic.
*/
uids_mutex_lock();
- local_irq_save(flags);
-
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ spin_lock_irqsave(&uidhash_lock, flags);
+ if (atomic_read(&up->__count) == 0) {
uid_hash_remove(up);
remove_user = 1;
- spin_unlock_irqrestore(&uidhash_lock, flags);
- } else {
- local_irq_restore(flags);
}
+ spin_unlock_irqrestore(&uidhash_lock, flags);
if (!remove_user)
goto done;
- kobject_uevent(&up->kobj, KOBJ_REMOVE);
- kobject_del(&up->kobj);
- kobject_put(&up->kobj);
+ if (up->user_ns == &init_user_ns) {
+ kobject_uevent(&up->kobj, KOBJ_REMOVE);
+ kobject_del(&up->kobj);
+ kobject_put(&up->kobj);
+ }
sched_destroy_user(up);
key_put(up->uid_keyring);
@@ -330,17 +330,28 @@ done:
*/
static void free_user(struct user_struct *up, unsigned long flags)
{
- /* restore back the count */
- atomic_inc(&up->__count);
spin_unlock_irqrestore(&uidhash_lock, flags);
-
- put_user_ns(up->user_ns);
- INIT_WORK(&up->work, remove_user_sysfs_dir);
- schedule_work(&up->work);
+ INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
+ schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
}
#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
+static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ atomic_inc(&user->__count);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; }
static inline void uids_mutex_lock(void) { }
@@ -357,12 +368,29 @@ static void free_user(struct user_struct *up, unsigned long flags)
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
- put_user_ns(up->user_ns);
kmem_cache_free(uid_cachep, up);
}
#endif
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+ return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+ return 1;
+}
+#endif
+
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 79084311ee5..076c7c8215b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -60,12 +60,25 @@ int create_user_ns(struct cred *new)
return 0;
}
-void free_user_ns(struct kref *kref)
+/*
+ * Deferred destructor for a user namespace. This is required because
+ * free_user_ns() may be called with uidhash_lock held, but we need to call
+ * back to free_uid() which will want to take the lock again.
+ */
+static void free_user_ns_work(struct work_struct *work)
{
- struct user_namespace *ns;
-
- ns = container_of(kref, struct user_namespace, kref);
+ struct user_namespace *ns =
+ container_of(work, struct user_namespace, destroyer);
free_uid(ns->creator);
kfree(ns);
}
+
+void free_user_ns(struct kref *kref)
+{
+ struct user_namespace *ns =
+ container_of(kref, struct user_namespace, kref);
+
+ INIT_WORK(&ns->destroyer, free_user_ns_work);
+ schedule_work(&ns->destroyer);
+}
EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af..8a82b4b8ea5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
#include <linux/err.h>
#include <linux/slab.h>
+static struct uts_namespace *create_uts_ns(void)
+{
+ struct uts_namespace *uts_ns;
+
+ uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (uts_ns)
+ kref_init(&uts_ns->kref);
+ return uts_ns;
+}
+
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
- ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ ns = create_uts_ns();
if (!ns)
return ERR_PTR(-ENOMEM);
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
up_read(&uts_sem);
- kref_init(&ns->kref);
return ns;
}
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b354593..92359cc747a 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
up_write(&uts_sem);
}
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
/*
* Special case of dostring for the UTS structure. This has locks
* to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc..c4bd3d825f3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
#include <linux/wait.h>
#include <linux/hash.h>
-void init_waitqueue_head(wait_queue_head_t *q)
+void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
{
spin_lock_init(&q->lock);
+ lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
}
-EXPORT_SYMBOL(init_waitqueue_head);
+EXPORT_SYMBOL(__init_waitqueue_head);
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
@@ -91,6 +92,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
+/*
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -117,6 +127,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
+/*
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @state: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and noone wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_locked_key(q, mode, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +220,20 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
- int ret = 0;
-
do {
+ int ret;
+
prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags)) {
- if ((ret = (*action)(q->key.flags)))
- break;
- }
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__wait_on_bit_lock);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae3..addfe2df93b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,8 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -48,8 +50,6 @@ struct cpu_workqueue_struct {
struct workqueue_struct *wq;
struct task_struct *thread;
-
- int run_depth; /* Detect run_workqueue() recursion depth */
} ____cacheline_aligned;
/*
@@ -128,6 +128,8 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, struct list_head *head)
{
+ trace_workqueue_insertion(cwq->thread, work);
+
set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
@@ -262,13 +264,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
spin_lock_irq(&cwq->lock);
- cwq->run_depth++;
- if (cwq->run_depth > 3) {
- /* morton gets to eat his hat */
- printk("%s: recursion depth exceeded: %d\n",
- __func__, cwq->run_depth);
- dump_stack();
- }
while (!list_empty(&cwq->worklist)) {
struct work_struct *work = list_entry(cwq->worklist.next,
struct work_struct, entry);
@@ -284,7 +279,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
*/
struct lockdep_map lockdep_map = work->lockdep_map;
#endif
-
+ trace_workqueue_execution(cwq->thread, work);
cwq->current_work = work;
list_del_init(cwq->worklist.next);
spin_unlock_irq(&cwq->lock);
@@ -311,7 +306,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
spin_lock_irq(&cwq->lock);
cwq->current_work = NULL;
}
- cwq->run_depth--;
spin_unlock_irq(&cwq->lock);
}
@@ -323,8 +317,6 @@ static int worker_thread(void *__cwq)
if (cwq->wq->freezeable)
set_freezable();
- set_user_nice(current, -5);
-
for (;;) {
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
if (!freezing(current) &&
@@ -368,29 +360,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
- int active;
-
- if (cwq->thread == current) {
- /*
- * Probably keventd trying to flush its own queue. So simply run
- * it by hand rather than deadlocking.
- */
- run_workqueue(cwq);
- active = 1;
- } else {
- struct wq_barrier barr;
+ int active = 0;
+ struct wq_barrier barr;
- active = 0;
- spin_lock_irq(&cwq->lock);
- if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
- insert_wq_barrier(cwq, &barr, &cwq->worklist);
- active = 1;
- }
- spin_unlock_irq(&cwq->lock);
+ WARN_ON(cwq->thread == current);
- if (active)
- wait_for_completion(&barr.done);
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, &cwq->worklist);
+ active = 1;
}
+ spin_unlock_irq(&cwq->lock);
+
+ if (active)
+ wait_for_completion(&barr.done);
return active;
}
@@ -416,7 +399,7 @@ void flush_workqueue(struct workqueue_struct *wq)
might_sleep();
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
- for_each_cpu_mask_nr(cpu, *cpu_map)
+ for_each_cpu(cpu, cpu_map)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +530,7 @@ static void wait_on_work(struct work_struct *work)
wq = cwq->wq;
cpu_map = wq_cpu_map(wq);
- for_each_cpu_mask_nr(cpu, *cpu_map)
+ for_each_cpu(cpu, cpu_map)
wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
@@ -615,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
* schedule_work - put work task in global workqueue
* @work: job to be done
*
- * This puts a job in the kernel-global workqueue.
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
*/
int schedule_work(struct work_struct *work)
{
@@ -787,6 +775,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
cwq->thread = p;
+ trace_workqueue_creation(cwq->thread, cpu);
+
return 0;
}
@@ -891,6 +881,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
* checks list_empty(), and a "normal" queue_work() can't use
* a dead CPU.
*/
+ trace_workqueue_destruction(cwq->thread);
kthread_stop(cwq->thread);
cwq->thread = NULL;
}
@@ -911,7 +902,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
list_del(&wq->list);
spin_unlock(&workqueue_lock);
- for_each_cpu_mask_nr(cpu, *cpu_map)
+ for_each_cpu(cpu, cpu_map)
cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
cpu_maps_update_done();
@@ -971,18 +962,20 @@ undo:
}
#ifdef CONFIG_SMP
+
struct work_for_cpu {
- struct work_struct work;
+ struct completion completion;
long (*fn)(void *);
void *arg;
long ret;
};
-static void do_work_for_cpu(struct work_struct *w)
+static int do_work_for_cpu(void *_wfc)
{
- struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
-
+ struct work_for_cpu *wfc = _wfc;
wfc->ret = wfc->fn(wfc->arg);
+ complete(&wfc->completion);
+ return 0;
}
/**
@@ -991,25 +984,25 @@ static void do_work_for_cpu(struct work_struct *w)
* @fn: the function to run
* @arg: the function arg
*
- * This will return -EINVAL in the cpu is not online, or the return value
- * of @fn otherwise.
+ * This will return the value @fn returns.
+ * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
*/
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
- struct work_for_cpu wfc;
-
- INIT_WORK(&wfc.work, do_work_for_cpu);
- wfc.fn = fn;
- wfc.arg = arg;
- get_online_cpus();
- if (unlikely(!cpu_online(cpu)))
- wfc.ret = -EINVAL;
- else {
- schedule_work_on(cpu, &wfc.work);
- flush_work(&wfc.work);
- }
- put_online_cpus();
-
+ struct task_struct *sub_thread;
+ struct work_for_cpu wfc = {
+ .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+ .fn = fn,
+ .arg = arg,
+ };
+
+ sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+ if (IS_ERR(sub_thread))
+ return PTR_ERR(sub_thread);
+ kthread_bind(sub_thread, cpu);
+ wake_up_process(sub_thread);
+ wait_for_completion(&wfc.completion);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);