diff options
Diffstat (limited to 'kernel')
36 files changed, 2925 insertions, 643 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ca83b73fba1..0249f4be9b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: diff --git a/kernel/exit.c b/kernel/exit.c index e61891f8012..f7864ac2ecc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); diff --git a/kernel/fork.c b/kernel/fork.c index 4c20fff8c13..166b8c49257 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,7 +91,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; diff --git a/kernel/futex.c b/kernel/futex.c index 4949d336d88..fb65e822fc4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } @@ -1028,7 +1029,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1226,6 +1226,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1304,6 +1305,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ @@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760f..bd7273e6282 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5240d75f4c6..84495958e70 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) */ static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; @@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = get_kprobe(p->addr); + if (unlikely(!old_p)) + return NULL; + + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid; + return NULL; + } +valid: + return old_p; +} + +/* Return error if the kprobe is being re-registered */ +static inline int check_kprobe_rereg(struct kprobe *p) +{ + int ret = 0; + struct kprobe *old_p; + + mutex_lock(&kprobe_mutex); + old_p = __get_valid_kprobe(p); + if (old_p) + ret = -EINVAL; + mutex_unlock(&kprobe_mutex); + return ret; +} + int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; @@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; p->addr = addr; + ret = check_kprobe_rereg(p); + if (ret) + return ret; + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { @@ -754,26 +795,6 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return NULL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return old_p; -} - /* * Unregister a kprobe without a scheduler synchronization. */ @@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe709982ca..ab7ae57773e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3815ac1d58b..f5dcd36d315 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,7 +49,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include <trace/events/lockdep.h> +#include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; @@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static inline u64 lockstat_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + static int lock_point(unsigned long points[], unsigned long ip) { int i; @@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip) return i; } -static void lock_time_inc(struct lock_time *lt, s64 time) +static void lock_time_inc(struct lock_time *lt, u64 time) { if (time > lt->max) lt->max = time; @@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats) static void lock_release_holdtime(struct held_lock *hlock) { struct lock_class_stats *stats; - s64 holdtime; + u64 holdtime; if (!lock_stat) return; - holdtime = sched_clock() - hlock->holdtime_stamp; + holdtime = lockstat_clock() - hlock->holdtime_stamp; stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) @@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = references; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; - hlock->holdtime_stamp = sched_clock(); + hlock->holdtime_stamp = lockstat_clock(); #endif if (check == 2 && !mark_irqflags(curr, hlock)) @@ -3322,7 +3327,7 @@ found_it: if (hlock->instance != lock) return; - hlock->waittime_stamp = sched_clock(); + hlock->waittime_stamp = lockstat_clock(); contention_point = lock_point(hlock_class(hlock)->contention_point, ip); contending_point = lock_point(hlock_class(hlock)->contending_point, @@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - u64 now; - s64 waittime = 0; + u64 now, waittime = 0; int i, cpu; depth = curr->lockdep_depth; @@ -3374,7 +3378,7 @@ found_it: cpu = smp_processor_id(); if (hlock->waittime_stamp) { - now = sched_clock(); + now = lockstat_clock(); waittime = now - hlock->waittime_stamp; hlock->holdtime_stamp = now; } diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5eced..acd24e7643e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace notify_die(enum die_val val, const char *str, +int notrace __kprobes notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/params.c b/kernel/params.c index 9da58eabdcb..d656c276508 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; @@ -304,6 +300,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -313,6 +310,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) @@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7f29643c898..3256e36ad25 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -28,6 +28,7 @@ #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/perf_event.h> +#include <linux/ftrace_event.h> #include <asm/irq_regs.h> @@ -1658,6 +1659,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1665,6 +1668,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1974,7 +1978,8 @@ unlock: return ret; } -int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2002,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -2666,20 +2674,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_lock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cpu; + int cur, cpu = get_cpu(); handle->locked = 0; - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; + for (;;) { + cur = atomic_cmpxchg(&data->lock, -1, cpu); + if (cur == -1) { + handle->locked = 1; + break; + } + if (cur == cpu) + break; - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); - - handle->locked = 1; + } } static void perf_output_unlock(struct perf_output_handle *handle) @@ -2725,7 +2734,7 @@ again: if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: - local_irq_restore(handle->flags); + put_cpu(); } void perf_output_copy(struct perf_output_handle *handle, @@ -3806,9 +3815,14 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; @@ -3826,6 +3840,10 @@ static int perf_swevent_match(struct perf_event *event, return 0; } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; + return 1; } @@ -3842,7 +3860,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } rcu_read_unlock(); @@ -4108,6 +4126,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -4131,8 +4150,15 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4157,12 +4183,53 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } -#endif + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4208,6 +4275,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + case PERF_COUNT_SW_ALIGNMENT_FAULTS: + case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; @@ -4416,7 +4485,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83d686..04a9e90d248 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,21 +693,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +720,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +739,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1acf9..25596e450ac 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b101cdc4df3..890f6b11b1d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) + if (!ret) + ret = err2; + if (!ret) printk("\b\b\b\bdone\n"); + else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** @@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -572,8 +573,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -596,7 +595,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -604,8 +603,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02ac743..f3077c0ab18 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -59,7 +59,7 @@ NUM_RCU_LVL_2, \ NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ @@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } @@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; + rsp->signaled = RCU_GP_IDLE; rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -913,7 +917,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + + /* + * If there was a task blocking the current grace period, + * and if all CPUs have checked in, we need to propagate + * the quiescent state up the rcu_node hierarchy. But that + * is inconvenient at the moment due to deadlock issues if + * this should end the current grace period. So set the + * offlined CPU's bit in ->qsmask in order to force the + * next force_quiescent_state() invocation to clean up this + * mess in a deadlock-free manner. + */ + if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) + rnp->qsmask |= mask; + mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; @@ -958,7 +975,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -1011,6 +1028,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1142,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1158,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) /* Update state, record completion counter. */ spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp == rsp->completed && + rsp->signaled == RCU_SAVE_DYNTICK) { rsp->signaled = RCU_FORCE_QS; dyntick_record_completed(rsp, lastcomp); } @@ -1224,7 +1250,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1288,10 +1314,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1523,6 +1559,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b40ac570604..1899023b096 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -167,6 +167,10 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ @@ -197,9 +201,10 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ @@ -302,9 +307,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0cb783aa16..ef2a58c2b9d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. + * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = rcu_preempted_readers(rnp); struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || @@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + + return retval; } /* @@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period. We are supposed to expedite the + * grace period, but this is the crude slow compatability hack, so just + * invoke synchronize_rcu(). + */ +void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Check to see if there is any immediate preemptable-RCU-related work * to be done. */ @@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* @@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) diff --git a/kernel/sched.c b/kernel/sched.c index 76c0e9691fc..3c11ae0a948 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ @@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq) /** * runqueue_is_locked + * @cpu: the processor in question. * * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock @@ -1563,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1617,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, static int tg_shares_up(struct task_group *tg, void *data) { unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; /* * If there are currently no tasks on the cpu pretend there @@ -1651,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -1995,6 +1993,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + spin_lock_irqsave(&rq->lock, flags); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2007,7 +2037,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -2311,7 +2341,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; + struct rq *rq, *orig_rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2319,7 +2349,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + rq = orig_rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2350,6 +2380,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, set_task_cpu(p, cpu); rq = task_rq_lock(p, &flags); + + if (rq != orig_rq) + update_rq_clock(rq); + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -3656,6 +3690,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu @@ -6718,9 +6753,6 @@ EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. - * - * But don't do that if it is a deliberate, throttling IO wait (this task - * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { @@ -9404,6 +9436,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; @@ -9529,13 +9565,13 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b47eed..37087a7fac2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1568,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; update_curr(cfs_rq); @@ -1582,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1639,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1654,16 +1684,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); diff --git a/kernel/sys.c b/kernel/sys.c index 255475d163e..ce17760d9c5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } @@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg4 | arg5) return -EINVAL; switch (arg2) { - case 0: + case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; - case 1: + case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; - if (arg3 != 0) + if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; - else + else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; break; default: return -EINVAL; } error = 0; break; - + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423ca711..b6e7aaea460 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->ctl_name && table->strategy) set_fail(&fail, table, "Strategy without ctl_name"); #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b416512ad17..f05671609a8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -428,6 +428,23 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on X86 + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 26f03ac07c2..edc3a3cca1a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37ba67e3326..7cb6f192259 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void) func = __ftrace_trace_function; #endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -740,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1261,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1656,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1758,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable) int not; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1826,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1991,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2068,7 +2045,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2222,15 +2199,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, ret = ftrace_process_regex(parser->buffer, parser->idx, enable); if (ret) - goto out; + goto out_unlock; trace_parser_clear(parser); } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } @@ -2297,6 +2274,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) #define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { @@ -2312,6 +2290,31 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2328,6 +2331,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2513,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; @@ -2624,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2684,7 +2691,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2745,7 +2752,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2778,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2845,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2863,11 +2851,140 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } +static int ftrace_pid_add(int p) +{ + struct pid *pid; + struct ftrace_pid *fpid; + int ret = -EINVAL; + + mutex_lock(&ftrace_lock); + + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); + + if (!pid) + goto out; + + ret = 0; + + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; + + ret = -ENOMEM; + + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; + + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; + + set_ftrace_pid_task(pid); + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); + } + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); +} + +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct pid *pid; char buf[64]; long val; int ret; @@ -2880,57 +2997,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + strstrip(buf); + if (strlen(buf) == 0) + return 1; + ret = strict_strtol(buf, 10, &val); if (ret < 0) return ret; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; - - clear_ftrace_pid_task(&ftrace_pid_trace); - - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); - - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } - - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); + ret = ftrace_pid_add(val); - if (!pid) - goto out; - - ftrace_pid_trace = pid; - - set_ftrace_pid_task(ftrace_pid_trace); - } - - /* update the function call */ - ftrace_update_pid_func(); - ftrace_startup_enable(0); + return ret ? ret : cnt; +} - out: - mutex_unlock(&ftrace_lock); +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); - return cnt; + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff0197054..db223fe8887 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -483,7 +486,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -1193,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) atomic_inc(&cpu_buffer->record_disabled); synchronize_sched(); + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); + spin_unlock_irq(&cpu_buffer->reader_lock); rb_check_pages(cpu_buffer); @@ -1868,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -2111,7 +2116,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. @@ -2681,7 +2686,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45068269ebb..9d3067a62d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -1393,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_printk(&global_trace, ip, fmt, args); + return trace_array_vprintk(&global_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 405cb850b75..b4e4212e66d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -101,6 +101,29 @@ struct syscall_trace_exit { unsigned long ret; }; +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kretprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -483,10 +506,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -500,12 +519,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); @@ -687,7 +706,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -699,22 +717,40 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -730,7 +766,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 8d5c171cc99..e0d351b01f5 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,44 +8,39 @@ #include <linux/module.h> #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char *trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +struct perf_trace_buf *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +struct perf_trace_buf *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); /* Count the events in use (per event id, not per instance) */ static int total_profile_count; static int ftrace_profile_enable_event(struct ftrace_event_call *event) { - char *buf; + struct perf_trace_buf *buf; int ret = -ENOMEM; if (atomic_inc_return(&event->profile_count)) return 0; if (!total_profile_count) { - buf = (char *)alloc_percpu(profile_buf_t); + buf = alloc_percpu(struct perf_trace_buf); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = alloc_percpu(struct perf_trace_buf); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(); + ret = event->profile_enable(event); if (!ret) { total_profile_count++; return 0; @@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) fail_buf_nmi: if (!total_profile_count) { - free_percpu(trace_profile_buf_nmi); - free_percpu(trace_profile_buf); - trace_profile_buf_nmi = NULL; - trace_profile_buf = NULL; + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; } fail_buf: atomic_dec(&event->profile_count); @@ -84,19 +79,19 @@ int ftrace_profile_enable(int event_id) static void ftrace_profile_disable_event(struct ftrace_event_call *event) { - char *buf, *nmi_buf; + struct perf_trace_buf *buf, *nmi_buf; if (!atomic_add_negative(-1, &event->profile_count)) return; - event->profile_disable(); + event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d128f65778e..1d18315dc83 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_define_common_fields); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(call->data); + call->unregfunc(call); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call->data); + call->regfunc(call); } break; } @@ -507,7 +503,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -878,9 +874,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -892,7 +888,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -910,12 +905,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { ret = call->define_fields(call); @@ -924,41 +919,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) +static int __trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + int ret; -#ifdef CONFIG_MODULES + if (!call->name) + return -EINVAL; -static LIST_HEAD(ftrace_module_file_list); + if (call->raw_init) { + ret = call->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "events/%s\n", call->name); + return ret; + } + } -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call); + mutex_unlock(&event_mutex); + return ret; +} static void remove_subsystem_dir(const char *name) { @@ -986,6 +1000,53 @@ static void remove_subsystem_dir(const char *name) } } +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1043,7 +1104,7 @@ static void trace_module_add_events(struct module *mod) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1061,10 +1122,11 @@ static void trace_module_add_events(struct module *mod) return; } call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (!ret) + list_add(&call->list, &ftrace_events); } } @@ -1078,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->system); + __trace_remove_event_call(call); } } @@ -1203,7 +1258,7 @@ static __init int event_trace_init(void) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1211,10 +1266,12 @@ static __init int event_trace_init(void) continue; } } - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); } while (true) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 23245785927..50504cb228d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,10 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); - return -ENOMEM; + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -933,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps) while (!list_empty(&ps->postfix)) { elt = list_first_entry(&ps->postfix, struct postfix_elt, list); - kfree(elt->operand); list_del(&elt->list); + kfree(elt->operand); + kfree(elt); } } @@ -1044,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1089,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1137,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1152,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1167,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1186,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1204,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1214,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1234,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9753fcc61bc..934d81fb4ca 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \ #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __field_desc #define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __array_desc #define __array_desc(type, container, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __dynamic_array #define __dynamic_array(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ if (!ret) \ return 0; @@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ #include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} #undef __field #define __field(type, item) @@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ -static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ #include "trace_entries.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000000..3696476f307 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1513 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_NARGS "__probe_nargs" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_NARGS, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return regs_get_kernel_stack_nth(regs, + (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, + void *dummy) +{ + return kernel_stack_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_call call; + struct trace_event event; + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = regs_query_register_name((unsigned int)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "$retval"); + else if (ff->func == fetch_stack_address) + ret = snprintf(buf, n, "$stack"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + size_t l = 0; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret >= n) + goto end; + l += ret; + ret = probe_arg_string(buf + l, n - l, &id->orig); + if (ret < 0) + goto end; + l += ret; + ret = snprintf(buf + l, n - l, ")"); + ret += l; + } +end: + if (ret >= n) + return -ENOSPC; + return ret; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) { + ff->func = fetch_retvalue; + ff->data = NULL; + } else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + ff->func = fetch_stack_address; + ff->data = NULL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + return ret; +} + +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = parse_probe_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else { + pr_info("Probe definition must be started with 'p' or 'r'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + + if (conflict_field_name(argv[i], tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + + /* Parse fetch argument */ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument%d(%s) is too long.\n", i, arg); + ret = -ENOSPC; + goto error; + } + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + if (ret) { + pr_info("Parse error at argument%d. (%d)\n", i, ret); + goto error; + } + } + tp->nr_args = i; + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s", tp->call.name); + + if (tp->symbol) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " 0x%p", tp->rp.kp.addr); + + for (i = 0; i < tp->nr_args; i++) { + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); + if (ret < 0) { + pr_warning("Argument%d decoding error(%d).\n", i, ret); + return ret; + } + seq_printf(m, " %s=%s", tp->args[i].name, buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int __probe_event_show_format(struct trace_seq *s, + struct trace_probe *tp, const char *fmt, + const char *arg) +{ + int i; + + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;\tsize:%u;\n", name, \ + (unsigned int)offsetof(typeof(field), item),\ + (unsigned int)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx)", + "REC->" FIELD_STRING_IP); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); + SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->" FIELD_STRING_FUNC + ", REC->" FIELD_STRING_RETIP); +} + +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + struct perf_trace_buf *trace_buf; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *raw_data; + + pc = preempt_count(); + __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + trace_buf = per_cpu_ptr(trace_buf, __cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. + */ + barrier(); + + raw_data = trace_buf->buf; + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ip, 1, entry, size); + +end_recursion: + trace_buf->recursion--; +end: + local_irq_restore(irq_flags); + + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + struct perf_trace_buf *trace_buf; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *raw_data; + + pc = preempt_count(); + __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + trace_buf = per_cpu_ptr(trace_buf, __cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. + */ + barrier(); + + raw_data = trace_buf->buf; + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end_recursion: + trace_buf->recursion--; +end: + local_irq_restore(irq_flags); + + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_EVENT_PROFILE */ + + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kprobe_profile_func(kp, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_profile_func(ri, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + if (probe_is_return(tp)) { + tp->event.trace = print_kretprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + tp->event.trace = print_kprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + call->event = &tp->event; + call->id = register_ftrace_event(&tp->event); + if (!call->id) + return -ENODEV; + call->enabled = 0; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + atomic_set(&call->profile_count, -1); + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + unregister_ftrace_event(&tp->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565826b..b6c12c6a1bc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 527e17eae57..51213b0aa81 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -14,6 +14,69 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -103,7 +166,8 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { @@ -120,7 +184,8 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) if (!entry) return 0; - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -130,8 +195,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -163,8 +230,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), SYSCALL_FIELD(long, ret)); if (!ret) @@ -212,7 +281,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -285,13 +354,13 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(void *ptr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; char *name; - name = (char *)ptr; + name = (char *)call->data; num = syscall_name_to_nr(name); if (num < 0 || num >= NR_syscalls) return -ENOSYS; @@ -309,12 +378,12 @@ int reg_event_syscall_enter(void *ptr) return ret; } -void unreg_event_syscall_enter(void *ptr) +void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; char *name; - name = (char *)ptr; + name = (char *)call->data; num = syscall_name_to_nr(name); if (num < 0 || num >= NR_syscalls) return; @@ -326,13 +395,13 @@ void unreg_event_syscall_enter(void *ptr) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(void *ptr) +int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; char *name; - name = (char *)ptr; + name = call->data; num = syscall_name_to_nr(name); if (num < 0 || num >= NR_syscalls) return -ENOSYS; @@ -350,12 +419,12 @@ int reg_event_syscall_exit(void *ptr) return ret; } -void unreg_event_syscall_exit(void *ptr) +void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; char *name; - name = (char *)ptr; + name = call->data; num = syscall_name_to_nr(name); if (num < 0 || num >= NR_syscalls) return; @@ -375,6 +444,29 @@ struct trace_event event_syscall_exit = { .trace = print_syscall_exit, }; +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + syscalls_metadata[i] = meta; + } + + return 0; +} +core_initcall(init_ftrace_syscalls); + #ifdef CONFIG_EVENT_PROFILE static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); @@ -385,6 +477,7 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; + struct perf_trace_buf *trace_buf; struct syscall_trace_enter *rec; unsigned long flags; char *raw_data; @@ -415,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + trace_buf = per_cpu_ptr(trace_buf, cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. + */ + barrier(); + + raw_data = trace_buf->buf; /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -435,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_tp_event(sys_data->enter_id, 0, 1, rec, size); +end_recursion: + trace_buf->recursion--; end: local_irq_restore(flags); } @@ -482,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; + struct perf_trace_buf *trace_buf; unsigned long flags; int syscall_nr; char *raw_data; @@ -513,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) cpu = smp_processor_id(); if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); + trace_buf = rcu_dereference(perf_trace_buf_nmi); else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + trace_buf = per_cpu_ptr(trace_buf, cpu); + + if (trace_buf->recursion++) + goto end_recursion; + + /* + * Make recursion update visible before entering perf_tp_event + * so that we protect from perf recursions. + */ + barrier(); + + raw_data = trace_buf->buf; /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -534,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) perf_tp_event(sys_data->exit_id, 0, 1, rec, size); +end_recursion: + trace_buf->recursion--; end: local_irq_restore(flags); } diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132a..46d0165ca70 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2df93b..12328147132 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done @@ -667,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; + /* + * when running in keventd don't schedule a work item on itself. + * Can just call directly because the work queue is already bound. + * This also is faster. + * Make this a generic parameter for other workqueues? + */ + if (current_is_keventd()) { + orig = raw_smp_processor_id(); + INIT_WORK(per_cpu_ptr(works, orig), func); + func(per_cpu_ptr(works, orig)); + } + get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); + if (cpu == orig) + continue; INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); + for_each_online_cpu(cpu) { + if (cpu != orig) + flush_work(per_cpu_ptr(works, cpu)); + } put_online_cpus(); free_percpu(works); return 0; |