diff options
Diffstat (limited to 'kernel')
54 files changed, 1397 insertions, 854 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2451dc6f328..4b05bd9479d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -277,7 +277,7 @@ static void untag_chunk(struct node *p) owner->root = NULL; } - for (i = j = 0; i < size; i++, j++) { + for (i = j = 0; j <= size; i++, j++) { struct audit_tree *s; if (&chunk->owners[j] == p) { list_del_init(&p->list); @@ -290,7 +290,7 @@ static void untag_chunk(struct node *p) if (!s) /* result of earlier fallback */ continue; get_tree(s); - list_replace_init(&chunk->owners[i].list, &new->owners[j].list); + list_replace_init(&chunk->owners[j].list, &new->owners[i].list); } list_replace_rcu(&chunk->hash, &new->hash); @@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(watch); + put_inotify_watch(&old->watch); return 0; } } spin_unlock(&hash_lock); chunk = alloc_chunk(old->count + 1); - if (!chunk) + if (!chunk) { + put_inotify_watch(&old->watch); return -ENOMEM; + } mutex_lock(&inode->inotify_mutex); if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { @@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); inotify_evict_watch(&old->watch); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ + put_inotify_watch(&old->watch); /* and kill it */ return 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 267e484f019..fc0f928167e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -250,7 +250,6 @@ struct audit_context { #endif }; -#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); diff --git a/kernel/bounds.c b/kernel/bounds.c index 3c530138183..98a51f26c13 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ void foo(void) { - /* The enum constants to put into include/linux/bounds.h */ + /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0249f4be9b5..aa3bee56644 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); - l->use_count++; return l; } } @@ -2937,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); - if (ss->use_id) - if (alloc_css_id(ss, parent, cgrp)) + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) goto err_destroy; + } /* At error, ->destroy() callback has to free assigned ID. */ } diff --git a/kernel/cpu.c b/kernel/cpu.c index 291ac586f37..677f25376a3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } @@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -ENOMEM; cpu_hotplug_begin(); + set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu) goto out; } - set_cpu_active(cpu, false); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_mask. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. - */ - synchronize_sched(); - err = _cpu_down(cpu, 0); out: @@ -382,19 +371,12 @@ int disable_nonboot_cpus(void) return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); - /* We take down all of the non-boot CPUs in one shot to avoid races + /* + * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - set_cpu_active(cpu, false); - } - - synchronize_sched(); - printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b..1ed8ca18790 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_KEYS new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); if (!new->tgcred) { - kfree(new); + kmem_cache_free(cred_jar, new); return NULL; } atomic_set(&new->tgcred->usage, 1); diff --git a/kernel/exit.c b/kernel/exit.c index 5962d7ccf24..546774a31a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + list_del_init(&p->sibling); __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); } /* @@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) /* * Any that need to be release_task'd are put on the @dead list. */ -static void reparent_thread(struct task_struct *father, struct task_struct *p, +static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - if (p->pdeath_signal) - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - list_move_tail(&p->sibling, &p->real_parent->children); if (task_detached(p)) @@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(task_ptrace(p)); - p->parent = p->real_parent; - } - reparent_thread(father, p, &dead_children); + struct task_struct *t = p; + do { + t->real_parent = reaper; + if (t->parent == father) { + BUG_ON(task_ptrace(t)); + t->parent = t->real_parent; + } + if (t->pdeath_signal) + group_send_sig_info(t->pdeath_signal, + SEND_SIG_NOINFO, t); + } while_each_thread(p, t); + reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); @@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. - */ - if (!task_detached(p)) { - int ret = wait_consider_task(wo, 0, p); - if (ret) - return ret; - } + int ret = wait_consider_task(wo, 0, p); + if (ret) + return ret; } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 202a0ba63d3..f88bd984df3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; @@ -1291,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { @@ -1303,6 +1287,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } diff --git a/kernel/futex.c b/kernel/futex.c index 8e3c3ffe1b9..e7a35f1039e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) } again: - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); + err = get_user_pages_fast(address, 1, 1, &page); if (err < 0) return err; @@ -532,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -760,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -867,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -913,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1175,11 +1197,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1738,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, */ retry: q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) return ret; @@ -1904,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.requeue_pi_key = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out; @@ -1974,7 +1995,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); @@ -2023,7 +2044,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -2215,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, rt_waiter.task = NULL; key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) goto out; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index dbcbf6a33a0..967e66143e1 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,6 +40,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/cpu.h> #include <linux/smp.h> #include <linux/hw_breakpoint.h> @@ -242,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { @@ -295,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + return ret; } @@ -323,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); @@ -388,7 +425,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, if (!cpu_events) return ERR_PTR(-ENOMEM); - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); @@ -399,18 +437,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, goto fail; } } + put_online_cpus(); return cpu_events; fail: - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); if (IS_ERR(*pevent)) break; unregister_hw_breakpoint(*pevent); } + put_online_cpus(); + free_percpu(cpu_events); - /* return the error if any */ return ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); diff --git a/kernel/kexec.c b/kernel/kexec.c index ae217488fef..ef077fb7315 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,7 @@ #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 3765ff3c1bb..35edbe22e9a 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,6 +1,7 @@ /* - * A simple kernel FIFO implementation. + * A generic kernel FIFO implementation. * + * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> * Copyright (C) 2004 Stelian Pop <stelian@popies.net> * * This program is free software; you can redistribute it and/or modify @@ -25,50 +26,48 @@ #include <linux/err.h> #include <linux/kfifo.h> #include <linux/log2.h> +#include <linux/uaccess.h> + +static void _kfifo_init(struct kfifo *fifo, void *buffer, + unsigned int size) +{ + fifo->buffer = buffer; + fifo->size = size; + + kfifo_reset(fifo); +} /** - * kfifo_init - allocates a new FIFO using a preallocated buffer + * kfifo_init - initialize a FIFO using a preallocated buffer + * @fifo: the fifo to assign the buffer * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer + * @size: the size of the internal buffer, this has to be a power of 2. * - * Do NOT pass the kfifo to kfifo_free() after use! Simply free the - * &struct kfifo with kfree(). */ -struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, - gfp_t gfp_mask, spinlock_t *lock) +void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) { - struct kfifo *fifo; - /* size must be a power of 2 */ BUG_ON(!is_power_of_2(size)); - fifo = kmalloc(sizeof(struct kfifo), gfp_mask); - if (!fifo) - return ERR_PTR(-ENOMEM); - - fifo->buffer = buffer; - fifo->size = size; - fifo->in = fifo->out = 0; - fifo->lock = lock; - - return fifo; + _kfifo_init(fifo, buffer, size); } EXPORT_SYMBOL(kfifo_init); /** - * kfifo_alloc - allocates a new FIFO and its internal buffer - * @size: the size of the internal buffer to be allocated. + * kfifo_alloc - allocates a new FIFO internal buffer + * @fifo: the fifo to assign then new buffer + * @size: the size of the buffer to be allocated, this have to be a power of 2. * @gfp_mask: get_free_pages mask, passed to kmalloc() - * @lock: the lock to be used to protect the fifo buffer + * + * This function dynamically allocates a new fifo internal buffer * * The size will be rounded-up to a power of 2. + * The buffer will be release with kfifo_free(). + * Return 0 if no error, otherwise the an error code */ -struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) +int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) { unsigned char *buffer; - struct kfifo *ret; /* * round up to the next power of 2, since our 'let the indices @@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) } buffer = kmalloc(size, gfp_mask); - if (!buffer) - return ERR_PTR(-ENOMEM); - - ret = kfifo_init(buffer, size, gfp_mask, lock); + if (!buffer) { + _kfifo_init(fifo, NULL, 0); + return -ENOMEM; + } - if (IS_ERR(ret)) - kfree(buffer); + _kfifo_init(fifo, buffer, size); - return ret; + return 0; } EXPORT_SYMBOL(kfifo_alloc); /** - * kfifo_free - frees the FIFO + * kfifo_free - frees the FIFO internal buffer * @fifo: the fifo to be freed. */ void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); - kfree(fifo); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); /** - * __kfifo_put - puts some data into the FIFO, no locking version + * kfifo_skip - skip output data * @fifo: the fifo to be used. - * @buffer: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. + * @len: number of bytes to skip */ -unsigned int __kfifo_put(struct kfifo *fifo, - const unsigned char *buffer, unsigned int len) +void kfifo_skip(struct kfifo *fifo, unsigned int len) +{ + if (len < kfifo_len(fifo)) { + __kfifo_add_out(fifo, len); + return; + } + kfifo_reset_out(fifo); +} +EXPORT_SYMBOL(kfifo_skip); + +static inline void __kfifo_in_data(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int off) { unsigned int l; - len = min(len, fifo->size - fifo->in + fifo->out); + /* + * Ensure that we sample the fifo->out index -before- we + * start putting bytes into the kfifo. + */ + + smp_mb(); + + off = __kfifo_off(fifo, fifo->in + off); + + /* first put the data starting from fifo->in to buffer end */ + l = min(len, fifo->size - off); + memcpy(fifo->buffer + off, from, l); + + /* then put the rest (if any) at the beginning of the buffer */ + memcpy(fifo->buffer, from + l, len - l); +} + +static inline void __kfifo_out_data(struct kfifo *fifo, + void *to, unsigned int len, unsigned int off) +{ + unsigned int l; + + /* + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. + */ + + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out + off); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + memcpy(to, fifo->buffer + off, l); + + /* then get the rest (if any) from the beginning of the buffer */ + memcpy(to + l, fifo->buffer, len - l); +} + +static inline int __kfifo_from_user_data(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned *lenout) +{ + unsigned int l; + int ret; /* * Ensure that we sample the fifo->out index -before- we @@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo, smp_mb(); + off = __kfifo_off(fifo, fifo->in + off); + /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); - memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); + l = min(len, fifo->size - off); + ret = copy_from_user(fifo->buffer + off, from, l); + if (unlikely(ret)) { + *lenout = ret; + return -EFAULT; + } + *lenout = l; /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, buffer + l, len - l); + ret = copy_from_user(fifo->buffer, from + l, len - l); + *lenout += ret ? ret : len - l; + return ret ? -EFAULT : 0; +} + +static inline int __kfifo_to_user_data(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int off, unsigned *lenout) +{ + unsigned int l; + int ret; /* - * Ensure that we add the bytes to the kfifo -before- - * we update the fifo->in index. + * Ensure that we sample the fifo->in index -before- we + * start removing bytes from the kfifo. */ - smp_wmb(); + smp_rmb(); + + off = __kfifo_off(fifo, fifo->out + off); + + /* first get the data from fifo->out until the end of the buffer */ + l = min(len, fifo->size - off); + ret = copy_to_user(to, fifo->buffer + off, l); + *lenout = l; + if (unlikely(ret)) { + *lenout -= ret; + return -EFAULT; + } + + /* then get the rest (if any) from the beginning of the buffer */ + len -= l; + ret = copy_to_user(to + l, fifo->buffer, len); + if (unlikely(ret)) { + *lenout += len - ret; + return -EFAULT; + } + *lenout += len; + return 0; +} + +unsigned int __kfifo_in_n(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + if (kfifo_avail(fifo) < len + recsize) + return len + 1; + + __kfifo_in_data(fifo, from, len, recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_in_n); - fifo->in += len; +/** + * kfifo_in - puts some data into the FIFO + * @fifo: the fifo to be used. + * @from: the data to be added. + * @len: the length of the data to be added. + * + * This function copies at most @len bytes from the @from buffer into + * the FIFO depending on the free space, and returns the number of + * bytes copied. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +unsigned int kfifo_in(struct kfifo *fifo, const void *from, + unsigned int len) +{ + len = min(kfifo_avail(fifo), len); + __kfifo_in_data(fifo, from, len, 0); + __kfifo_add_in(fifo, len); return len; } -EXPORT_SYMBOL(__kfifo_put); +EXPORT_SYMBOL(kfifo_in); + +unsigned int __kfifo_in_generic(struct kfifo *fifo, + const void *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_in_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_in_generic); + +unsigned int __kfifo_out_n(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize) +{ + if (kfifo_len(fifo) < len + recsize) + return len; + + __kfifo_out_data(fifo, to, len, recsize); + __kfifo_add_out(fifo, len + recsize); + return 0; +} +EXPORT_SYMBOL(__kfifo_out_n); /** - * __kfifo_get - gets some data from the FIFO, no locking version + * kfifo_out - gets some data from the FIFO * @fifo: the fifo to be used. - * @buffer: where the data must be copied. + * @to: where the data must be copied. * @len: the size of the destination buffer. * * This function copies at most @len bytes from the FIFO into the - * @buffer and returns the number of copied bytes. + * @to buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. */ -unsigned int __kfifo_get(struct kfifo *fifo, - unsigned char *buffer, unsigned int len) +unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) { - unsigned int l; + len = min(kfifo_len(fifo), len); - len = min(len, fifo->in - fifo->out); + __kfifo_out_data(fifo, to, len, 0); + __kfifo_add_out(fifo, len); - /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. - */ + return len; +} +EXPORT_SYMBOL(kfifo_out); - smp_rmb(); +/** + * kfifo_out_peek - copy some data from the FIFO, but do not remove it + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @offset: offset into the fifo + * + * This function copies at most @len bytes at @offset from the FIFO + * into the @to buffer and returns the number of copied bytes. + * The data is not removed from the FIFO. + */ +unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, + unsigned offset) +{ + len = min(kfifo_len(fifo), len + offset); - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); - memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); + __kfifo_out_data(fifo, to, len, offset); + return len; +} +EXPORT_SYMBOL(kfifo_out_peek); - /* then get the rest (if any) from the beginning of the buffer */ - memcpy(buffer + l, fifo->buffer, len - l); +unsigned int __kfifo_out_generic(struct kfifo *fifo, + void *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_out_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_out_generic); - /* - * Ensure that we remove the bytes from the kfifo -before- - * we update the fifo->out index. - */ +unsigned int __kfifo_from_user_n(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + unsigned total; - smp_mb(); + if (kfifo_avail(fifo) < len + recsize) + return len + 1; - fifo->out += len; + __kfifo_from_user_data(fifo, from, len, recsize, &total); + return total; +} +EXPORT_SYMBOL(__kfifo_from_user_n); - return len; +/** + * kfifo_from_user - puts some data from user space into the FIFO + * @fifo: the fifo to be used. + * @from: pointer to the data to be added. + * @len: the length of the data to be added. + * @total: the actual returned data length. + * + * This function copies at most @len bytes from the @from into the + * FIFO depending and returns -EFAULT/0. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +int kfifo_from_user(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned *total) +{ + int ret; + len = min(kfifo_avail(fifo), len); + ret = __kfifo_from_user_data(fifo, from, len, 0, total); + if (ret) + return ret; + __kfifo_add_in(fifo, len); + return 0; } -EXPORT_SYMBOL(__kfifo_get); +EXPORT_SYMBOL(kfifo_from_user); + +unsigned int __kfifo_from_user_generic(struct kfifo *fifo, + const void __user *from, unsigned int len, unsigned int recsize) +{ + return __kfifo_from_user_rec(fifo, from, len, recsize); +} +EXPORT_SYMBOL(__kfifo_from_user_generic); + +unsigned int __kfifo_to_user_n(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int reclen, + unsigned int recsize) +{ + unsigned int ret, total; + + if (kfifo_len(fifo) < reclen + recsize) + return len; + + ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); + + if (likely(ret == 0)) + __kfifo_add_out(fifo, reclen + recsize); + + return total; +} +EXPORT_SYMBOL(__kfifo_to_user_n); + +/** + * kfifo_to_user - gets data from the FIFO and write it to user space + * @fifo: the fifo to be used. + * @to: where the data must be copied. + * @len: the size of the destination buffer. + * @lenout: pointer to output variable with copied data + * + * This function copies at most @len bytes from the FIFO into the + * @to buffer and 0 or -EFAULT. + * + * Note that with only one concurrent reader and one concurrent + * writer, you don't need extra locking to use these functions. + */ +int kfifo_to_user(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned *lenout) +{ + int ret; + len = min(kfifo_len(fifo), len); + ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); + __kfifo_add_out(fifo, *lenout); + return ret; +} +EXPORT_SYMBOL(kfifo_to_user); + +unsigned int __kfifo_to_user_generic(struct kfifo *fifo, + void __user *to, unsigned int len, unsigned int recsize, + unsigned int *total) +{ + return __kfifo_to_user_rec(fifo, to, len, recsize, total); +} +EXPORT_SYMBOL(__kfifo_to_user_generic); + +unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +{ + if (recsize == 0) + return kfifo_avail(fifo); + + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_peek_generic); + +void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +{ + __kfifo_skip_rec(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_skip_generic); + diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e2351..761fdd2b303 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); + /* Disable any cpu specific hw breakpoints */ + kgdb_disable_hw_debug(regs); + /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); @@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1453,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1553,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kmod.c b/kernel/kmod.c index 25b10319036..bf0e231d970 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, return -ENOMEM; ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) - goto out; + if (ret < 0) { + call_usermodehelper_freeinfo(sub_info); + return ret; + } - return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (ret < 0) /* Failed to execute helper, close pipe */ + filp_close(*filp, NULL); - out: - call_usermodehelper_freeinfo(sub_info); return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5342a344c4..b7df302a020 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * num_possible_cpus()); + rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #else rp->maxactive = num_possible_cpus(); #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index ab7ae57773e..fbb6222fe7e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + +/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe4..c62ec14609b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) diff --git a/kernel/module.c b/kernel/module.c index 12afc5a3ddd..f82386bd9ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -880,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason) } #ifdef CONFIG_MODVERSIONS +/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */ +static unsigned long maybe_relocated(unsigned long crc, + const struct module *crc_owner) +{ +#ifdef ARCH_RELOCATES_KCRCTAB + if (crc_owner == NULL) + return crc - (unsigned long)reloc_start; +#endif + return crc; +} + static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { unsigned int i, num_versions; struct modversion_info *versions; @@ -905,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs, if (strcmp(versions[i].name, symname) != 0) continue; - if (versions[i].crc == *crc) + if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; DEBUGP("Found checksum %lX vs module %lX\n", - *crc, versions[i].crc); + maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -931,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc); + return check_version(sechdrs, versindex, "module_layout", mod, crc, + NULL); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -949,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { return 1; } @@ -984,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, /* use_module can fail due to OOM, or module initialization or unloading */ if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc) || - !use_module(mod, owner)) + if (!check_version(sechdrs, versindex, name, mod, crc, owner) + || !use_module(mod, owner)) sym = NULL; } return sym; @@ -996,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, * J. Corbet <corbet@lwn.net> */ #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} + struct module_sect_attr { struct module_attribute mattr; @@ -1037,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC - && sechdrs[i].sh_size) + if (!sect_empty(&sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1056,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; for (i = 0; i < nsect; i++) { - if (! (sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!sechdrs[i].sh_size) + if (sect_empty(&sechdrs[i])) continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, @@ -1142,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; for (i = 0; i < nsect; i++) - if ((sechdrs[i].sh_flags & SHF_ALLOC) && + if (!sect_empty(&sechdrs[i]) && (sechdrs[i].sh_type == SHT_NOTE)) ++notes; @@ -1158,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; for (loaded = i = 0; i < nsect; ++i) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { nattr->attr.name = mod->sect_attrs->attrs[loaded].name; @@ -1896,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, unsigned int i; /* only scan the sections containing data */ - kmemleak_scan_area(mod->module_core, (unsigned long)mod - - (unsigned long)mod->module_core, - sizeof(struct module), GFP_KERNEL); + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < hdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -1907,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) continue; - kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - - (unsigned long)mod->module_core, + kmemleak_scan_area((void *)sechdrs[i].sh_addr, sechdrs[i].sh_size, GFP_KERNEL); } } @@ -2236,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9052d6c8c9f..2ae7409bf38 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -782,6 +782,9 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * Don't put the event on if it is disabled or if * it is in a group and the group isn't on. @@ -925,6 +928,9 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. @@ -1375,6 +1381,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + if (event->cpu != -1 && event->cpu != smp_processor_id()) + continue; + hwc = &event->hw; interrupts = hwc->interrupts; @@ -1595,15 +1604,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) unsigned long flags; int err; - /* - * If cpu is not a wildcard then this is a percpu event: - */ - if (cpu != -1) { + if (pid == -1 && cpu != -1) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu > num_possible_cpus()) + if (cpu < 0 || cpu >= nr_cpumask_bits) return ERR_PTR(-EINVAL); /* @@ -1611,7 +1617,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) * offline CPU and activate it when the CPU comes up, but * that's for later. */ - if (!cpu_isset(cpu, cpu_online_map)) + if (!cpu_online(cpu)) return ERR_PTR(-ENODEV); cpuctx = &per_cpu(perf_cpu_context, cpu); @@ -3253,8 +3259,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3262,6 +3266,12 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.comm || event->attr.mmap || event->attr.task) return 1; @@ -3287,12 +3297,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); } @@ -3320,6 +3329,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3369,6 +3379,12 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.comm) return 1; @@ -3405,15 +3421,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_comm_ctx(ctx, comm_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); } @@ -3488,6 +3499,12 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (event->attr.mmap) return 1; @@ -3561,15 +3578,10 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_mmap_ctx(ctx, mmap_event); + put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); @@ -3860,6 +3872,9 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + if (event->cpu != -1 && event->cpu != smp_processor_id()) + return 0; + if (!perf_swevent_is_counting(event)) return 0; @@ -4564,7 +4579,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -4717,7 +4732,7 @@ SYSCALL_DEFINE5(perf_event_open, if (IS_ERR(event)) goto err_put_context; - err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); if (err < 0) goto err_free_put_context; @@ -5141,7 +5156,7 @@ int perf_event_init_task(struct task_struct *child) GFP_KERNEL); if (!child_ctx) { ret = -ENOMEM; - goto exit; + break; } __perf_event_init_context(child_ctx, child); @@ -5157,7 +5172,7 @@ int perf_event_init_task(struct task_struct *child) } } - if (inherited_all) { + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. @@ -5177,7 +5192,6 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } -exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); diff --git a/kernel/printk.c b/kernel/printk.c index 2c9dc0b03a5..1751c456b71 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1412,7 +1412,7 @@ static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. - * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be @@ -1442,7 +1442,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. - * @dump: pointer to the kmsg_dumper structure + * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. diff --git a/kernel/resource.c b/kernel/resource.c index dc15686b7a7..af96c1e4b54 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,37 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - resource_size_t start, end; + struct resource tmp = *new; - start = root->start; + tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to new->end below would cause an underflow. + * of this->start - 1 to tmp->end below would cause an underflow. */ if (this && this->start == 0) { - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } for(;;) { if (this) - end = this->start - 1; + tmp.end = this->start - 1; else - end = root->end; - if (start < min) - start = min; - if (end > max) - end = max; - start = ALIGN(start, align); + tmp.end = root->end; + if (tmp.start < min) + tmp.start = min; + if (tmp.end > max) + tmp.end = max; + tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, new, size, align); - if (start < end && end - start >= size - 1) { - new->start = start; - new->end = start + size - 1; + alignf(alignf_data, &tmp, size, align); + if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { + new->start = tmp.start; + new->end = tmp.start + size - 1; return 0; } if (!this) break; - start = this->end + 1; + tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; diff --git a/kernel/sched.c b/kernel/sched.c index 18cceeecce3..3a8fb30a91b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,39 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - * - * Function lives here instead of kthread.c because it messes with - * scheduler internals which require locking. - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - raw_spin_lock_irqsave(&rq->lock, flags); - update_rq_clock(rq); - set_task_cpu(p, cpu); - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; - p->flags |= PF_THREAD_BOUND; - raw_spin_unlock_irqrestore(&rq->lock, flags); -} -EXPORT_SYMBOL(kthread_bind); - #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2044,6 +2011,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2052,9 +2022,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) @@ -2065,22 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +#endif trace_sched_migrate_task(p, new_cpu); - if (old_cpu != new_cpu) { + if (task_cpu(p) != new_cpu) { p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -2105,13 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) { - update_rq_clock(rq); - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2317,10 +2282,71 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rcu_read_lock(); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + rcu_read_unlock(); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + +/* + * Gets called from 3 sites (exec, fork, wakeup), since it is called without + * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done + * by: + * + * exec: is unstable, retry loop + * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING + */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - return p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_online(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); + + return cpu; } #endif @@ -2375,6 +2401,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2438,8 +2468,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2538,14 +2568,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2556,6 +2578,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as waking here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_WAKING; /* * Revert to default priority/policy on fork if requested. @@ -2590,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2622,18 +2647,35 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We still have TASK_WAKING but PF_STARTING is gone now, meaning + * ->cpus_allowed is stable, we have preemption disabled, meaning + * cpu_online_mask is stable. + */ + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +#endif rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3101,21 +3143,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; + int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; +again: + this_cpu = get_cpu(); + dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; + } + rq = task_rq_lock(p, &flags); + put_cpu(); + + /* + * select_task_rq() can race against ->cpus_allowed + */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + || unlikely(!cpu_active(dest_cpu))) { + task_rq_unlock(rq, &flags); + goto again; + } /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { @@ -3130,24 +3187,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: task_rq_unlock(rq, &flags); } /* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ @@ -5498,8 +5541,11 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (need_resched()) @@ -5911,14 +5957,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5933,12 +5980,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); @@ -6457,7 +6505,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6465,7 +6513,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6483,7 +6531,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6494,7 +6542,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6504,7 +6552,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6515,22 +6563,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6616,7 +6660,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6632,7 +6676,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; @@ -6876,7 +6920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6889,13 +6933,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6986,6 +7030,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); @@ -7100,7 +7145,27 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Since we rely on wake-ups to migrate sleeping tasks, don't change + * the ->cpus_allowed mask from under waking tasks, which would be + * possible when we change rq->lock in ttwu(), so synchronize against + * TASK_WAKING to avoid that. + * + * Make an exception for freshly cloned tasks, since cpuset namespaces + * might move the task about, we have to validate the target in + * wake_up_new_task() anyway since the cpu might have gone away. + */ +again: + while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) + cpu_relax(); + rq = task_rq_lock(p, &flags); + + if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { + task_rq_unlock(rq, &flags); + goto again; + } + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7156,7 +7221,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7172,12 +7237,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } @@ -7273,37 +7339,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + dest_cpu = select_fallback_rq(dead_cpu, p); -move: /* It can have affinity changed while we were choosing. */ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) goto again; @@ -9668,7 +9707,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } @@ -10083,7 +10122,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 479ce5682d7..5b496132c28 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { - unsigned long long clock; - unsigned long flags; + return sched_clock_cpu(cpu); +} - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - return clock; -} EXPORT_SYMBOL_GPL(cpu_clock); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bedf6e3ebf..8fe7ee81c55 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1429,6 +1462,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. @@ -1472,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { @@ -1975,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2028,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2073,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea2828164..f48328ac216 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif diff --git a/kernel/signal.c b/kernel/signal.c index 1814e68e4de..934ae5e687b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi struct user_struct *user; /* - * We won't get problems with the target's UID changing under us - * because changing it requires RCU be used, and if t != current, the - * caller must be holding the RCU readlock (by way of a spinlock) and - * we use RCU protection here + * Protect access to @t credentials. This can go away when all + * callers hold rcu read lock. */ + rcu_read_lock(); user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + rcu_read_unlock(); if (override_rlimit || atomic_read(&user->sigpending) <= @@ -979,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) for (i = 0; i < 16; i++) { unsigned char insn; - __get_user(insn, (unsigned char *)(regs->ip + i)); + if (get_user(insn, (unsigned char *)(regs->ip + i))) + break; printk("%02x ", insn); } } @@ -1179,11 +1180,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, int ret = -EINVAL; struct task_struct *p; const struct cred *pcred; + unsigned long flags; if (!valid_signal(sig)) return ret; - read_lock(&tasklist_lock); + rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; @@ -1199,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = security_task_kill(p, info, sig, secid); if (ret) goto out_unlock; - if (sig && p->sighand) { - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = __send_signal(sig, info, p, 1, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + + if (sig) { + if (lock_task_sighand(p, &flags)) { + ret = __send_signal(sig, info, p, 1, 0); + unlock_task_sighand(p, &flags); + } else + ret = -ESRCH; } out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); diff --git a/kernel/smp.c b/kernel/smp.c index de735a6637d..f1040842244 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask, goto call; /* Try for same node. */ - nodemask = cpumask_of_node(cpu); + nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef7..7c1a67ef027 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e2..0d4c7898ab8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } diff --git a/kernel/sys.c b/kernel/sys.c index 20ccfb5da6a..18bde979f34 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -162,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (niceval > 19) niceval = 19; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -199,6 +200,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); out: return error; } @@ -220,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -265,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45e4bef0012..8a68b244846 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1131,7 +1131,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #else @@ -1214,6 +1214,7 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_MMU { .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, @@ -1221,6 +1222,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = mmap_min_addr_handler, }, +#endif #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b75dbf40f57..8f5d16e0707 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1399,6 +1399,13 @@ static void deprecated_sysctl_warning(const int *name, int nlen) { int i; + /* + * CTL_KERN/KERN_VERSION is used by older glibc and cannot + * ever go away. + */ + if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + return; + if (printk_ratelimit()) { printk(KERN_INFO "warning: process `%s' used the deprecated sysctl " @@ -1410,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen) return; } +#define WARN_ONCE_HASH_BITS 8 +#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS) + +static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE); + +#define FNV32_OFFSET 2166136261U +#define FNV32_PRIME 0x01000193 + +/* + * Print each legacy sysctl (approximately) only once. + * To avoid making the tables non-const use a external + * hash-table instead. + * Worst case hash collision: 6, but very rarely. + * NOTE! We don't use the SMP-safe bit tests. We simply + * don't care enough. + */ +static void warn_on_bintable(const int *name, int nlen) +{ + int i; + u32 hash = FNV32_OFFSET; + + for (i = 0; i < nlen; i++) + hash = (hash ^ name[i]) * FNV32_PRIME; + hash %= WARN_ONCE_HASH_SIZE; + if (__test_and_set_bit(hash, warn_once_bitmap)) + return; + deprecated_sysctl_warning(name, nlen); +} + static ssize_t do_sysctl(int __user *args_name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { @@ -1424,7 +1460,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen, if (get_user(name[i], args_name + i)) return -EFAULT; - deprecated_sysctl_warning(name, nlen); + warn_on_bintable(name, nlen); return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); } diff --git a/kernel/time.c b/kernel/time.c index c6324d96009..804798005d1 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,6 +136,7 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; + update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d5fc0fd1cc..d7395fdfb9f 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -238,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old, */ void clockevents_notify(unsigned long reason, void *arg) { - struct list_head *node, *tmp; + struct clock_event_device *dev, *tmp; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); @@ -250,8 +251,20 @@ void clockevents_notify(unsigned long reason, void *arg) * Unregister the clock event devices which were * released from the users in the notify chain. */ - list_for_each_safe(node, tmp, &clockevents_released) - list_del(node); + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + cpu = *((int *)arg); + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + list_del(&dev->list); + } + } break; default: break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d3..13700833c18 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -458,8 +470,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af4135f0582..e2ab064c6d4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,6 +165,13 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; +static struct timespec xtime_cache __attribute__ ((aligned (16))); +void update_xtime_cache(u64 nsec) +{ + xtime_cache = xtime; + timespec_add_ns(&xtime_cache, nsec); +} + /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; + update_xtime_cache(0); + timekeeper.ntp_error = 0; ntp_clear(); @@ -550,6 +559,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } + update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -722,6 +733,7 @@ static void timekeeping_adjust(s64 offset) timekeeper.ntp_error_shift; } + /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -765,6 +777,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) return offset; } + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -774,6 +787,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; + u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -839,6 +853,9 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; + nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); + update_xtime_cache(nsecs); + /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } @@ -863,6 +880,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -872,16 +890,17 @@ void monotonic_to_bootbased(struct timespec *ts) { *ts = timespec_add_safe(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime.tv_sec; + return xtime_cache.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime; + return xtime_cache; } struct timespec current_kernel_time(void) @@ -891,7 +910,8 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -905,7 +925,8 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + + now = xtime_cache; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 28265636b6c..bdfb8dd1050 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -237,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m) #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - tick_get_broadcast_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_mask())[0]); #ifdef CONFIG_TICK_ONESHOT SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - tick_get_broadcast_oneshot_mask()->bits[0]); + cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); #endif SEQ_printf(m, "\n"); #endif diff --git a/kernel/timer.c b/kernel/timer.c index 5db5a8d2681..c61a7949387 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = __get_cpu_var(tvec_bases); - cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) @@ -1200,6 +1198,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); + perf_event_do_pending(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_event_do_pending(); - hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d006554888d..60e2ce0181e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,39 +12,37 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_TRACER bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_DYNAMIC_FTRACE bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_FTRACE_MCOUNT_RECORD bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config HAVE_HW_BRANCH_TRACER bool @@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER config HAVE_SYSCALL_TRACEPOINTS bool help - See Documentation/trace/ftrace-implementation.txt + See Documentation/trace/ftrace-design.txt config TRACER_MAX_TRACE bool @@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options. +# hiding of the automatic options. config TRACING bool @@ -119,7 +117,7 @@ menuconfig FTRACE bool "Tracers" default y if DEBUG_KERNEL help - Enable the kernel tracing infrastructure. + Enable the kernel tracing infrastructure. if FTRACE @@ -133,7 +131,7 @@ config FUNCTION_TRACER help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation - instruction to the beginning of every kernel function, which NOP + instruction at the beginning of every kernel function, which NOP sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very @@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER and its entry. Its first purpose is to trace the duration of functions and draw a call graph for each thread with some information like - the return value. This is done by setting the current return + the return value. This is done by setting the current return address on the current task structure into a stack of calls. @@ -173,7 +171,7 @@ config IRQSOFF_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be used together or separately.) @@ -186,7 +184,7 @@ config PREEMPT_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP help - This option measures the time spent in preemption off critical + This option measures the time spent in preemption-off critical sections, with microsecond accuracy. The default measurement method is a maximum search, which is @@ -195,7 +193,7 @@ config PREEMPT_TRACER echo 0 > /sys/kernel/debug/tracing/tracing_max_latency - (Note that kernel size and overhead increases with this option + (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be used together or separately.) @@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS depends on !GENERIC_TRACER select TRACING help - This tracer hooks to various trace points in the kernel + This tracer hooks to various trace points in the kernel, allowing the user to pick and choose which trace point they want to trace. It also includes the sched_switch tracer plugin. @@ -265,19 +263,19 @@ choice The likely/unlikely profiler only looks at the conditions that are annotated with a likely or unlikely macro. - The "all branch" profiler will profile every if statement in the + The "all branch" profiler will profile every if-statement in the kernel. This profiler will also enable the likely/unlikely - profiler as well. + profiler. - Either of the above profilers add a bit of overhead to the system. - If unsure choose "No branch profiling". + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". config BRANCH_PROFILE_NONE bool "No branch profiling" help - No branch profiling. Branch profiling adds a bit of overhead. - Only enable it if you want to analyse the branching behavior. - Otherwise keep it disabled. + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" @@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES /sys/kernel/debug/tracing/profile_annotated_branch - Note: this will add a significant overhead, only turn this + Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES @@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system - is to be analyzed + is to be analyzed in much detail. endchoice config TRACING_BRANCHES @@ -335,7 +333,7 @@ config POWER_TRACER depends on X86 select GENERIC_TRACER help - This tracer helps developers to analyze and optimize the kernels + This tracer helps developers to analyze and optimize the kernel's power management decisions, specifically the C-state and P-state behavior. @@ -391,14 +389,14 @@ config HW_BRANCH_TRACER select GENERIC_TRACER help This tracer records all branches on the system in a circular - buffer giving access to the last N branches for each cpu. + buffer, giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug @@ -417,15 +415,15 @@ config WORKQUEUE_TRACER bool "Trace workqueues" select GENERIC_TRACER help - The workqueue tracer provides some statistical informations + The workqueue tracer provides some statistical information about each cpu workqueue thread such as the number of the works inserted and executed since their creation. It can help - to evaluate the amount of work each of them have to perform. + to evaluate the amount of work each of them has to perform. For example it can help a developer to decide whether he should - choose a per cpu workqueue instead of a singlethreaded one. + choose a per-cpu workqueue instead of a singlethreaded one. config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" + bool "Support for tracing block IO actions" depends on SYSFS depends on BLOCK select RELAY @@ -456,15 +454,15 @@ config KPROBE_EVENT select TRACING default y help - This allows the user to add tracing events (similar to tracepoints) on the fly - via the ftrace interface. See Documentation/trace/kprobetrace.txt - for more details. + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. Those events can be inserted wherever kprobes can probe, and record various register and memory values. - This option is also required by perf-probe subcommand of perf tools. If - you want to use perf tools, this option is strongly recommended. + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" @@ -472,32 +470,32 @@ config DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replaces them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. - This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise - has native performance as long as no tracing is active. + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. A file is created - in debugfs called function_profile_enabled which defaults to zero. - When a 1 is echoed into this file profiling begins, and when a - zero is entered, profiling stops. A file in the trace_stats - directory called functions, that show the list of functions that - have been hit and their counters. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stats directory; this file shows the list of functions that + have been hit and their counters. - If in doubt, say N + If in doubt, say N. config FTRACE_MCOUNT_RECORD def_bool y @@ -556,8 +554,8 @@ config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER help - This option creates a test to stress the ring buffer and bench mark it. - It creates its own ring buffer such that it will not interfer with + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with any other users of the ring buffer (such as ftrace). It then creates a producer and consumer that will run for 10 seconds and sleep for 10 seconds. Each interval it will print out the number of events @@ -566,7 +564,7 @@ config RING_BUFFER_BENCHMARK It does not disable interrupts or raise its priority, so it may be affected by processes that are running. - If unsure, say N + If unsure, say N. endif # FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e51a1bcb7be..1e6640f8045 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; - char *ptr; + int slen; switch (type) { case MATCH_FULL: @@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type) matched = 1; break; case MATCH_END_ONLY: - ptr = strstr(str, regex); - if (ptr && (ptr[len] == 0)) + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) matched = 1; break; } @@ -1724,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return ftrace_match(str, regex, len, type); } -static void ftrace_match_records(char *buff, int len, int enable) +static int ftrace_match_records(char *buff, int len, int enable) { unsigned int search_len; struct ftrace_page *pg; @@ -1733,6 +1733,7 @@ static void ftrace_match_records(char *buff, int len, int enable) char *search; int type; int not; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; type = filter_parse_regex(buff, len, &search, ¬); @@ -1750,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } /* * Only enable filtering if we have a function that @@ -1759,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } static int @@ -1780,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, return 1; } -static void ftrace_match_module_records(char *buff, char *mod, int enable) +static int ftrace_match_module_records(char *buff, char *mod, int enable) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1789,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) char *search = buff; unsigned long flag; int not = 0; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; @@ -1819,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } /* @@ -1853,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return -EINVAL; - ftrace_match_module_records(func, mod, enable); - return 0; + if (ftrace_match_module_records(func, mod, enable)) + return 0; + return -EINVAL; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -2151,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable) func = strsep(&next, ":"); if (!next) { - ftrace_match_records(func, len, enable); - return 0; + if (ftrace_match_records(func, len, enable)) + return 0; + return ret; } /* command found */ @@ -2198,10 +2208,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); + trace_parser_clear(parser); if (ret) goto out_unlock; - - trace_parser_clear(parser); } ret = read; @@ -2543,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) exists = true; break; } - if (!exists) { + if (!exists) array[(*idx)++] = rec->ip; - found = 1; - } + found = 1; } } while_for_each_ftrace_rec(); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index e06c6e3d56a..9f4f565b01e 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -14,7 +14,5 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f58c9ad1583..8c1b2d29071 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -464,6 +464,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -1193,9 +1195,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); @@ -1211,12 +1210,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); - + spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1227,9 +1223,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); @@ -1242,11 +1235,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); + spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -1254,11 +1245,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, * @buffer: the buffer to resize. * @size: the new size. * - * The tracer is responsible for making sure that the buffer is - * not being used while changing the size. - * Note: We may be able to change the above requirement by using - * RCU synchronizations. - * * Minimum size is 2 * BUF_PAGE_SIZE. * * Returns -1 on failure. @@ -1290,6 +1276,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size == buffer_size) return size; + atomic_inc(&buffer->record_disabled); + + /* Make sure all writers are done with this buffer. */ + synchronize_sched(); + mutex_lock(&buffer->mutex); get_online_cpus(); @@ -1352,6 +1343,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return size; free_pages: @@ -1361,6 +1354,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -ENOMEM; /* @@ -1370,6 +1364,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) out_fail: put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -2723,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -2876,7 +2873,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * Splice the empty reader page into the list around the head. */ reader = rb_set_head_page(cpu_buffer); - cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); cpu_buffer->reader_page->list.prev = reader->list.prev; /* @@ -2913,7 +2910,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * * Now make the new head point back to the reader page. */ - reader->list.next->prev = &cpu_buffer->reader_page->list; + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(cpu_buffer, &cpu_buffer->head_page); /* Finally update the reader page to the new head */ @@ -3067,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3088,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 31118ae16f0..eac6875cb99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include <linux/ring_buffer.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -313,7 +313,6 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", - "sched-tree", "trace_printk", "ftrace_preempt", "branch", @@ -952,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; @@ -1151,6 +1155,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, __ftrace_trace_stack(tr->buffer, flags, skip, pc); } +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -2316,67 +2336,49 @@ static const struct file_operations tracing_cpumask_fops = { .write = tracing_cpumask_write, }; -static ssize_t -tracing_trace_options_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; u32 tracer_flags; - int len = 0; - char *buf; - int r = 0; int i; - - /* calculate max size */ - for (i = 0; trace_options[i]; i++) { - len += strlen(trace_options[i]); - len += 3; /* "no" and newline */ - } - mutex_lock(&trace_types_lock); tracer_flags = current_trace->flags->val; trace_opts = current_trace->flags->opts; - /* - * Increase the size with names of options specific - * of the current tracer. - */ - for (i = 0; trace_opts[i].name; i++) { - len += strlen(trace_opts[i].name); - len += 3; /* "no" and newline */ - } - - /* +1 for \0 */ - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) { - mutex_unlock(&trace_types_lock); - return -ENOMEM; - } - for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s\n", trace_options[i]); + seq_printf(m, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s\n", trace_options[i]); + seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s\n", - trace_opts[i].name); + seq_printf(m, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s\n", - trace_opts[i].name); + seq_printf(m, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 1); + return 0; +} - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +static int __set_tracer_option(struct tracer *trace, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + int ret; - kfree(buf); - return r; + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; } /* Try to assign a tracer specific option */ @@ -2384,33 +2386,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; - int ret = 0, i = 0; - int len; + int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; - len = strlen(opts->name); - if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(tracer_flags->val, - opts->bit, !neg); - break; - } + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(trace, trace->flags, + opts, neg); } - /* Not found */ - if (!tracer_flags->opts[i].name) - return -EINVAL; - - /* Refused to handle */ - if (ret) - return ret; - - if (neg) - tracer_flags->val &= ~opts->bit; - else - tracer_flags->val |= opts->bit; - return 0; + return -EINVAL; } static void set_tracer_flags(unsigned int mask, int enabled) @@ -2430,7 +2416,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int ret; int i; @@ -2442,16 +2428,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); - if (strncmp(buf, "no", 2) == 0) { + if (strncmp(cmp, "no", 2) == 0) { neg = 1; cmp += 2; } for (i = 0; trace_options[i]; i++) { - int len = strlen(trace_options[i]); - - if (strncmp(cmp, trace_options[i], len) == 0) { + if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); break; } @@ -2471,9 +2456,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_trace_options_show, NULL); +} + static const struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_trace_options_read, + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_trace_options_write, }; @@ -3392,21 +3386,18 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } -static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_clock_show(struct seq_file *m, void *v) { - char buf[64]; - int bufiter = 0; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + seq_printf(m, "%s%s%s%s", i ? " " : "", i == trace_clock_id ? "[" : "", trace_clocks[i].name, i == trace_clock_id ? "]" : ""); - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + seq_putc(m, '\n'); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); + return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, @@ -3448,6 +3439,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_clock_show, NULL); +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3486,8 +3484,10 @@ static const struct file_operations tracing_mark_fops = { }; static const struct file_operations trace_clock_fops = { - .open = tracing_open_generic, - .read = tracing_clock_read, + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_clock_write, }; @@ -3948,39 +3948,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - ret = 0; - switch (val) { - case 0: - /* do nothing if already cleared */ - if (!(topt->flags->val & topt->opt->bit)) - break; - - mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 0); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - topt->flags->val &= ~topt->opt->bit; - break; - case 1: - /* do nothing if already set */ - if (topt->flags->val & topt->opt->bit) - break; + if (val != 0 && val != 1) + return -EINVAL; + if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 1); + ret = __set_tracer_option(current_trace, topt->flags, + topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) return ret; - topt->flags->val |= topt->opt->bit; - break; - - default: - return -EINVAL; } *ppos += cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a52bed2eedd..4df6a77eb19 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -597,18 +597,17 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_SCHED_TREE = 0x200, - TRACE_ITER_PRINTK = 0x400, - TRACE_ITER_PREEMPTONLY = 0x800, - TRACE_ITER_BRANCH = 0x1000, - TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_SLEEP_TIME = 0x80000, - TRACE_ITER_GRAPH_TIME = 0x100000, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 0x80000, }; /* diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index d9c60f80aa0..9e25573242c 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) char *buf; int ret = -ENOMEM; - if (atomic_inc_return(&event->profile_count)) + if (event->profile_count++ > 0) return 0; if (!total_profile_count) { @@ -56,7 +56,7 @@ fail_buf_nmi: perf_trace_buf = NULL; } fail_buf: - atomic_dec(&event->profile_count); + event->profile_count--; return ret; } @@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (!atomic_add_negative(-1, &event->profile_count)) + if (--event->profile_count > 0) return; event->profile_disable(event); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d18315dc83..189b09baf4f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; -int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(struct ftrace_event_call *call) { int ret; struct trace_entry ent; @@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } -EXPORT_SYMBOL_GPL(trace_define_common_fields); void trace_destroy_fields(struct ftrace_event_call *call) { @@ -105,9 +104,25 @@ void trace_destroy_fields(struct ftrace_event_call *call) } } -static void ftrace_event_enable_disable(struct ftrace_event_call *call, +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + +static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { + int ret = 0; + switch (enable) { case 0: if (call->enabled) { @@ -118,12 +133,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, break; case 1: if (!call->enabled) { - call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call); + ret = call->regfunc(call); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", call->name); + break; + } + call->enabled = 1; } break; } + + return ret; } static void ftrace_clear_events(void) @@ -402,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(call, val); mutex_unlock(&event_mutex); break; @@ -412,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return cnt; + return ret ? ret : cnt; } static ssize_t @@ -913,7 +936,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, id); if (call->define_fields) { - ret = call->define_fields(call); + ret = trace_define_common_fields(call); + if (!ret) + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228d..e42af9aad69 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, { char **addr = (char **)(event + pred->offset); int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex.match(*addr, &pred->regex, len); match = cmp ^ pred->not; @@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } -/* Basic regex callbacks */ +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + static int regex_match_full(char *str, struct regex *r, int len) { if (strncmp(str, r->pattern, len) == 0) @@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len) static int regex_match_front(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) + if (strncmp(str, r->pattern, r->len) == 0) return 1; return 0; } static int regex_match_middle(char *str, struct regex *r, int len) { - if (strstr(str, r->pattern)) + if (strnstr(str, r->pattern, len)) return 1; return 0; } static int regex_match_end(char *str, struct regex *r, int len) { - char *ptr = strstr(str, r->pattern); + int strlen = len - 1; - if (ptr && (ptr[r->len] == 0)) + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) return 1; return 0; } @@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps, pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; - else { + else fn = filter_pred_pchar; - pred->regex.field_len = strlen(pred->regex.pattern); - } } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index dff8c84ddf1..d4fa5dc1ee4 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item), 0, FILTER_OTHER); \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), \ container.item), \ - sizeof(field.container.item), 0, \ - FILTER_OTHER); \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; @@ -184,10 +185,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ struct struct_name field; \ int ret; \ \ - ret = trace_define_common_fields(event_call); \ - if (ret) \ - return ret; \ - \ tstruct; \ \ return ret; \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3aa7eaa2114..2974bc7538c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr, goto out_unlock; trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, flags, 5, pc); if (data->critical_sequence != max_sequence) goto out_unlock; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b52d397e57e..50b1b823980 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -282,6 +282,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +/* Check the name is good for event/group */ +static int check_event_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return 0; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return 0; + } + return 1; +} + /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -293,10 +305,11 @@ static struct trace_probe *alloc_trace_probe(const char *group, int nargs, int is_return) { struct trace_probe *tp; + int ret = -ENOMEM; tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); if (!tp) - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); if (symbol) { tp->symbol = kstrdup(symbol, GFP_KERNEL); @@ -312,14 +325,20 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event) + if (!event || !check_event_name(event)) { + ret = -EINVAL; goto error; + } + tp->call.name = kstrdup(event, GFP_KERNEL); if (!tp->call.name) goto error; - if (!group) + if (!group || !check_event_name(group)) { + ret = -EINVAL; goto error; + } + tp->call.system = kstrdup(group, GFP_KERNEL); if (!tp->call.system) goto error; @@ -330,7 +349,7 @@ error: kfree(tp->call.name); kfree(tp->symbol); kfree(tp); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } static void free_probe_arg(struct probe_arg *arg) @@ -670,7 +689,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -695,10 +714,10 @@ static int create_trace_probe(int argc, char **argv) if (!event) { /* Make a new event name */ if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); event = buf; } @@ -1132,10 +1151,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct kprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (ret) - return ret; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ @@ -1150,10 +1165,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct kretprobe_trace_entry field; struct trace_probe *tp = (struct trace_probe *)event_call->data; - ret = trace_define_common_fields(event_call); - if (ret) - return ret; - DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); @@ -1190,10 +1201,11 @@ static int __probe_event_show_format(struct trace_seq *s, #undef SHOW_FIELD #define SHOW_FIELD(type, item, name) \ do { \ - ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ - "offset:%u;\tsize:%u;\n", name, \ + ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ + "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type)); \ + (unsigned int)sizeof(type), \ + is_signed_type(type)); \ if (!ret) \ return 0; \ } while (0) @@ -1453,7 +1465,6 @@ static int register_probe_event(struct trace_probe *tp) call->unregfunc = probe_event_disable; #ifdef CONFIG_EVENT_PROFILE - atomic_set(&call->profile_count, -1); call->profile_enable = probe_profile_enable; call->profile_disable = probe_profile_disable; #endif diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index acb87d4a4ac..94103cdcf9d 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -26,12 +26,13 @@ #include <linux/fs.h> #include "trace_output.h" -#include "trace_stat.h" #include "trace.h" #include <linux/hw_breakpoint.h> #include <asm/hw_breakpoint.h> +#include <asm/atomic.h> + /* * For now, let us restrict the no. of symbols traced simultaneously to number * of available hardware breakpoint registers. @@ -44,7 +45,7 @@ struct trace_ksym { struct perf_event **ksym_hbp; struct perf_event_attr attr; #ifdef CONFIG_PROFILE_KSYM_TRACER - unsigned long counter; + atomic64_t counter; #endif struct hlist_node ksym_hlist; }; @@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) rcu_read_lock(); hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if ((entry->attr.bp_addr == hbp_hit_addr) && - (entry->counter <= MAX_UL_INT)) { - entry->counter++; + if (entry->attr.bp_addr == hbp_hit_addr) { + atomic64_inc(&entry->counter); break; } } @@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) entry->attr.bp_addr = addr; entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - ret = -EAGAIN; entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, ksym_hbp_handler); @@ -236,7 +235,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, mutex_lock(&ksym_tracer_mutex); hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); + ret = trace_seq_printf(s, "%pS:", + (void *)(unsigned long)entry->attr.bp_addr); if (entry->attr.bp_type == HW_BREAKPOINT_R) ret = trace_seq_puts(s, "r--\n"); else if (entry->attr.bp_type == HW_BREAKPOINT_W) @@ -278,21 +278,20 @@ static ssize_t ksym_trace_filter_write(struct file *file, { struct trace_ksym *entry; struct hlist_node *node; - char *input_string, *ksymname = NULL; + char *buf, *input_string, *ksymname = NULL; unsigned long ksym_addr = 0; int ret, op, changed = 0; - input_string = kzalloc(count + 1, GFP_KERNEL); - if (!input_string) + buf = kzalloc(count + 1, GFP_KERNEL); + if (!buf) return -ENOMEM; - if (copy_from_user(input_string, buffer, count)) { - kfree(input_string); - return -EFAULT; - } - input_string[count] = '\0'; + ret = -EFAULT; + if (copy_from_user(buf, buffer, count)) + goto out; - strstrip(input_string); + buf[count] = '\0'; + input_string = strstrip(buf); /* * Clear all breakpoints if: @@ -303,15 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (!input_string[0] || !strcmp(input_string, "0") || !strcmp(input_string, "*:---")) { __ksym_trace_reset(); - kfree(input_string); - return count; + ret = 0; + goto out; } ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) { - kfree(input_string); - return ret; - } + if (ret < 0) + goto out; mutex_lock(&ksym_tracer_mutex); @@ -322,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (entry->attr.bp_type != op) changed = 1; else - goto out; + goto out_unlock; break; } } @@ -337,28 +334,24 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (IS_ERR(entry->ksym_hbp)) ret = PTR_ERR(entry->ksym_hbp); else - goto out; + goto out_unlock; } /* Error or "symbol:---" case: drop it */ ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); - goto out; + goto out_unlock; } else { /* Check for malformed request: (4) */ - if (op == 0) - goto out; - ret = process_new_ksym_entry(ksymname, op, ksym_addr); + if (op) + ret = process_new_ksym_entry(ksymname, op, ksym_addr); } -out: +out_unlock: mutex_unlock(&ksym_tracer_mutex); - - kfree(input_string); - - if (!ret) - ret = count; - return ret; +out: + kfree(buf); + return !ret ? count : ret; } static const struct file_operations ksym_tracing_fops = { @@ -450,102 +443,77 @@ struct tracer ksym_tracer __read_mostly = .print_line = ksym_trace_output }; -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - struct dentry *entry; - - d_tracer = tracing_init_dentry(); - ksym_filter_entry_count = 0; - - entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'ksym_trace_filter' file\n"); - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); - - #ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_tracer_stat_headers(struct seq_file *m) +static int ksym_profile_show(struct seq_file *m, void *v) { + struct hlist_node *node; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + seq_puts(m, " Access Type "); seq_puts(m, " Symbol Counter\n"); seq_puts(m, " ----------- "); seq_puts(m, " ------ -------\n"); - return 0; -} -static int ksym_tracer_stat_show(struct seq_file *m, void *v) -{ - struct hlist_node *stat = v; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + access_type = entry->attr.bp_type; - access_type = entry->attr.bp_type; + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15llu\n", + (unsigned long long)atomic64_read(&entry->counter)); } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", "<NA>"); - seq_printf(m, " %15lu\n", entry->counter); + rcu_read_unlock(); return 0; } -static void *ksym_tracer_stat_start(struct tracer_stat *trace) +static int ksym_profile_open(struct inode *node, struct file *file) { - return ksym_filter_head.first; + return single_open(file, ksym_profile_show, NULL); } -static void * -ksym_tracer_stat_next(void *v, int idx) -{ - struct hlist_node *stat = v; - - return stat->next; -} - -static struct tracer_stat ksym_tracer_stats = { - .name = "ksym_tracer", - .stat_start = ksym_tracer_stat_start, - .stat_next = ksym_tracer_stat_next, - .stat_headers = ksym_tracer_stat_headers, - .stat_show = ksym_tracer_stat_show +static const struct file_operations ksym_profile_fops = { + .open = ksym_profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; +#endif /* CONFIG_PROFILE_KSYM_TRACER */ -__init static int ksym_tracer_stat_init(void) +__init static int init_ksym_trace(void) { - int ret; + struct dentry *d_tracer; - ret = register_stat_tracer(&ksym_tracer_stats); - if (ret) { - printk(KERN_WARNING "Warning: could not register " - "ksym tracer stats\n"); - return 1; - } + d_tracer = tracing_init_dentry(); - return 0; + trace_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + trace_create_file("ksym_profile", 0444, d_tracer, + NULL, &ksym_profile_fops); +#endif + + return register_tracer(&ksym_tracer); } -fs_initcall(ksym_tracer_stat_init); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ +device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee3..f4bc9b27de5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 57501d90096..75289f372dd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; @@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_common_fields(call); - if (ret) - return ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; @@ -333,10 +325,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter); - if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); - } else { + if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; } @@ -370,10 +359,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit); - if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); - } else { + if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index f6693969287..a7974a552ca 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = print_context_stack, }; static int |