diff options
Diffstat (limited to 'kernel')
37 files changed, 1042 insertions, 159 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f..6aebdeb2aa3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1fbcc748044..aa3bee56644 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2936,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); - if (ss->use_id) - if (alloc_css_id(ss, parent, cgrp)) + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) goto err_destroy; + } /* At error, ->destroy() callback has to free assigned ID. */ } diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee94..677f25376a3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b..1ed8ca18790 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_KEYS new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); if (!new->tgcred) { - kfree(new); + kmem_cache_free(cred_jar, new); return NULL; } atomic_set(&new->tgcred->usage, 1); diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc..f88bd984df3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9..e7a35f1039e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -1971,7 +1995,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 50dbd599958..967e66143e1 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { @@ -296,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + return ret; } @@ -324,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/kexec.c b/kernel/kexec.c index a9a93d9ee7a..ef077fb7315 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> +#include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs) if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 32c5c15d750..35edbe22e9a 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); @@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @fifo: the fifo to be used. * @from: pointer to the data to be added. * @len: the length of the data to be added. + * @total: the actual returned data length. * * This function copies at most @len bytes from the @from into the * FIFO depending and returns -EFAULT/0. @@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. - @ @lenout: pointer to output variable with copied data + * @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the * @to buffer and 0 or -EFAULT. diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e2351..761fdd2b303 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); + /* Disable any cpu specific hw breakpoints */ + kgdb_disable_hw_debug(regs); + /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); @@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1453,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1553,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302a020..c4b43430d39 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -93,6 +93,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { {"native_get_debugreg",}, {"irq_entries_start",}, {"common_interrupt",}, + {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe4..c62ec14609b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 00000000000..6f9bcb8313d --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,690 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/err.h> +#include <linux/cpu.h> +#include <linux/padata.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/rcupdate.h> + +#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_OBJ_NUM 10000 * NR_CPUS + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, pwork); + pd = queue->pd; + pinst = pd->pinst; + + spin_lock(&queue->parallel.lock); + list_replace_init(&queue->parallel.list, &local_list); + spin_unlock(&queue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/* + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the cpumask of padata. + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. + */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = 0; + if (!(pinst->flags & PADATA_INIT)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = -EINVAL; + if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) + goto out; + + err = -EINPROGRESS; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->queue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->pwork); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus, empty, calc_seq_nr; + int seq_nr, next_nr, overrun, next_overrun; + struct padata_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + empty = 0; + next_nr = -1; + next_overrun = 0; + next_queue = NULL; + + num_cpus = cpumask_weight(pd->cpumask); + + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + reorder = &queue->reorder; + + /* + * Calculate the seq_nr of the object that should be + * next in this queue. + */ + overrun = 0; + calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) + + queue->cpu_index; + + if (unlikely(calc_seq_nr > pd->max_seq_nr)) { + calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; + overrun = 1; + } + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + seq_nr = padata->seq_nr; + BUG_ON(calc_seq_nr != seq_nr); + } else { + seq_nr = calc_seq_nr; + empty++; + } + + if (next_nr < 0 || seq_nr < next_nr + || (next_overrun && !overrun)) { + next_nr = seq_nr; + next_overrun = overrun; + next_queue = queue; + } + } + + padata = NULL; + + if (empty == num_cpus) + goto out; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + if (unlikely(next_overrun)) { + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + atomic_set(&queue->num_obj, 0); + } + } + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + atomic_inc(&next_queue->num_obj); + + goto out; + } + + if (next_nr % num_cpus == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_queue *queue; + struct padata_instance *pinst = pd->pinst; + +try_again: + if (!spin_trylock_bh(&pd->lock)) + goto out; + + while (1) { + padata = padata_get_next(pd); + + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + if (PTR_ERR(padata) == -ENODATA) { + spin_unlock_bh(&pd->lock); + goto out; + } + + queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + + spin_lock(&queue->serial.lock); + list_add_tail(&padata->list, &queue->serial.list); + spin_unlock(&queue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + } + + spin_unlock_bh(&pd->lock); + + if (atomic_read(&pd->reorder_objects)) + goto try_again; + +out: + return; +} + +static void padata_serial_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, swork); + pd = queue->pd; + + spin_lock(&queue->serial.lock); + list_replace_init(&queue->serial.list, &local_list); + spin_unlock(&queue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/* + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. + */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_queue *queue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + queue = per_cpu_ptr(pd->queue, cpu); + + spin_lock(&queue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &queue->reorder.list); + spin_unlock(&queue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + int cpu, cpu_index, num_cpus; + struct padata_queue *queue; + struct parallel_data *pd; + + cpu_index = 0; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->queue = alloc_percpu(struct padata_queue); + if (!pd->queue) + goto err_free_pd; + + if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) + goto err_free_queue; + + for_each_possible_cpu(cpu) { + queue = per_cpu_ptr(pd->queue, cpu); + + queue->pd = pd; + + if (cpumask_test_cpu(cpu, cpumask) + && cpumask_test_cpu(cpu, cpu_active_mask)) { + queue->cpu_index = cpu_index; + cpu_index++; + } else + queue->cpu_index = -1; + + INIT_LIST_HEAD(&queue->reorder.list); + INIT_LIST_HEAD(&queue->parallel.list); + INIT_LIST_HEAD(&queue->serial.list); + spin_lock_init(&queue->reorder.lock); + spin_lock_init(&queue->parallel.lock); + spin_lock_init(&queue->serial.lock); + + INIT_WORK(&queue->pwork, padata_parallel_worker); + INIT_WORK(&queue->swork, padata_serial_worker); + atomic_set(&queue->num_obj, 0); + } + + cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + + num_cpus = cpumask_weight(pd->cpumask); + pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_queue: + free_percpu(pd->queue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask); + free_percpu(pd->queue); + kfree(pd); +} + +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + while (atomic_read(&pd_old->refcnt) != 0) + yield(); + + flush_workqueue(pinst->wq); + + padata_free_pd(pd_old); + + pinst->flags &= ~PADATA_RESET; +} + +/* + * padata_set_cpumask - set the cpumask that padata should use + * + * @pinst: padata instance + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask) +{ + struct parallel_data *pd; + int err = 0; + + might_sleep(); + + mutex_lock(&pinst->lock); + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) { + err = -ENOMEM; + goto out; + } + + cpumask_copy(pinst->cpumask, cpumask); + + padata_replace(pinst, pd); + +out: + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_add_cpu - add a cpu to the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to add + */ +int padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_set_cpu(cpu, pinst->cpumask); + err = __padata_add_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_remove_cpu - remove a cpu from the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to remove + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_clear_cpu(cpu, pinst->cpumask); + err = __padata_remove_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/* + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +void padata_start(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags |= PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_start); + +/* + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags &= ~PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} + +/* + * padata_alloc - allocate and initialize a padata instance + * + * @cpumask: cpumask that padata uses for parallelization + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq) +{ + int err; + struct padata_instance *pinst; + struct parallel_data *pd; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) + goto err_free_inst; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq = wq; + + cpumask_copy(pinst->cpumask, cpumask); + + pinst->flags = 0; + + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + err = register_hotcpu_notifier(&pinst->cpu_notifier); + if (err) + goto err_free_pd; + + mutex_init(&pinst->lock); + + return pinst; + +err_free_pd: + padata_free_pd(pd); +err_free_inst: + kfree(pinst); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/* + * padata_free - free a padata instance + * + * @ padata_inst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + padata_stop(pinst); + + synchronize_rcu(); + + while (atomic_read(&pinst->pd->refcnt) != 0) + yield(); + + unregister_hotcpu_notifier(&pinst->cpu_notifier); + padata_free_pd(pinst->pd); + kfree(pinst); +} +EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index 5827f7b9725..c787333282b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif - kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); + kmsg_dump(KMSG_DUMP_PANIC); + /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 603c0d8b5df..2ae7409bf38 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3259,8 +3259,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3268,6 +3266,9 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3297,7 +3298,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); put_cpu_var(perf_cpu_context); @@ -3328,6 +3329,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3377,6 +3379,9 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -3494,6 +3499,9 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; @@ -4571,7 +4579,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb..5c36ea9d55d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting PM bugs, like suspend support. +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + default n + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + config PM_VERBOSE bool "Verbose Power Management debugging" depends on PM_DEBUG @@ -85,6 +94,11 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y +config PM_SLEEP_ADVANCED_DEBUG + bool + depends on PM_ADVANCED_DEBUG + default n + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE @@ -222,3 +236,8 @@ config PM_RUNTIME and the bus type drivers of the buses the devices are on are responsible for the actual handling of the autosuspend requests and wake-up events. + +config PM_OPS + bool + depends on PM_SLEEP || PM_RUNTIME + default y diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c713905..b58800b21fc 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) == NOTIFY_BAD) ? -EINVAL : 0; } +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; + +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -208,9 +234,12 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, #endif -#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, +#ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#endif NULL, }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e433..830cadecbdf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void) memory_bm_position_reset(©_bm); - while (to_free_normal > 0 && to_free_highmem > 0) { + while (to_free_normal > 0 || to_free_highmem > 0) { unsigned long pfn = memory_bm_next_pfn(©_bm); struct page *page = pfn_to_page(pfn); @@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image: \n"); + printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9..1d575733d4e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p) struct swsusp_info *header; *flags_p = swsusp_header->flags; - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return PTR_ERR(resume_bdev); - } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot, PAGE_SIZE); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd189..00000000000 --- a/kernel/power/swsusp.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek <pavel@ucw.cz>: - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi <dirk@loth.demon.co.uk>: - * Support the possibility of hardware state restoring. - * - * Raph <grey.havens@earthling.net>: - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff <garloff@suse.de>: - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr <a.mohr@mailto.de> - * - * Alex Badea <vampire@go.ro>: - * Fixed runaway init - * - * Rafael J. Wysocki <rjw@sisk.pl> - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... - * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/swap.h> -#include <linux/pm.h> -#include <linux/swapops.h> -#include <linux/bootmem.h> -#include <linux/syscalls.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/rbtree.h> -#include <linux/io.h> - -#include "power.h" - -int in_suspend __nosavedata = 0; diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f..4d2289626a8 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } +static void snapshot_deprecated_ioctl(unsigned int cmd) +{ + if (printk_ratelimit()) + printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " + "be removed soon, update your suspend-to-disk " + "utilities\n", + __builtin_return_address(0), cmd); +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_CREATE_IMAGE: case SNAPSHOT_ATOMIC_SNAPSHOT: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; @@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_PREF_IMAGE_SIZE: case SNAPSHOT_SET_IMAGE_SIZE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_AVAIL_SWAP: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_GET_SWAP_PAGE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; @@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); if (!swsusp_swap_in_use()) { /* * User space encodes device types as two-byte values, @@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); error = -EINVAL; switch (arg) { diff --git a/kernel/printk.c b/kernel/printk.c index 17463ca2e22..1751c456b71 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static const char const *kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", + [KMSG_DUMP_KEXEC] = "kexec", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54..24e9e60c145 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -188,6 +188,36 @@ static int __release_resource(struct resource *old) return -EINVAL; } +static void __release_child_resources(struct resource *r) +{ + struct resource *tmp, *p; + resource_size_t size; + + p = r->child; + r->child = NULL; + while (p) { + tmp = p; + p = p->sibling; + + tmp->parent = NULL; + tmp->sibling = NULL; + __release_child_resources(tmp); + + printk(KERN_DEBUG "release child resource %pR\n", tmp); + /* need to restore size, and keep flags */ + size = resource_size(tmp); + tmp->start = 0; + tmp->end = size - 1; + } +} + +void release_child_resources(struct resource *r) +{ + write_lock(&resource_lock); + __release_child_resources(r); + write_unlock(&resource_lock); +} + /** * request_resource - request and reserve an I/O or memory resource * @root: root resource descriptor @@ -303,8 +333,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { struct resource *this = root->child; @@ -330,7 +362,7 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = max; tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, &tmp, size, align); + tmp.start = alignf(alignf_data, &tmp, size, align); if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { new->start = tmp.start; new->end = tmp.start + size - 1; @@ -358,8 +390,10 @@ static int find_resource(struct resource *root, struct resource *new, int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { int err; diff --git a/kernel/sched.c b/kernel/sched.c index c535cc4f642..3a8fb30a91b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Called from: + * Gets called from 3 sites (exec, fork, wakeup), since it is called without + * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done + * by: * - * - fork, @p is stable because it isn't on the tasklist yet - * - * - exec, @p is unstable, retry loop - * - * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so - * we should be good. + * exec: is unstable, retry loop + * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) @@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We still have TASK_WAKING but PF_STARTING is gone now, meaning + * ->cpus_allowed is stable, we have preemption disabled, meaning + * cpu_online_mask is stable. + */ + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_WAKING); @@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -5530,8 +5541,11 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) + if (unlikely(reacquire_kernel_lock(current) < 0)) { + prev = rq->curr; + switch_count = &prev->nivcsw; goto need_resched_nonpreemptible; + } preempt_enable_no_resched(); if (need_resched()) @@ -7136,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) * the ->cpus_allowed mask from under waking tasks, which would be * possible when we change rq->lock in ttwu(), so synchronize against * TASK_WAKING to avoid that. + * + * Make an exception for freshly cloned tasks, since cpuset namespaces + * might move the task about, we have to validate the target in + * wake_up_new_task() anyway since the cpu might have gone away. */ again: - while (p->state == TASK_WAKING) + while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) cpu_relax(); rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING) { + if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { task_rq_unlock(rq, &flags); goto again; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 42ac3c9f66f..8fe7ee81c55 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. */ - if (tmp->flags & SD_PREFER_SIBLING) + if (tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef7..7c1a67ef027 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e2..0d4c7898ab8 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b8..18bde979f34 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 6f740d9f094..d7395fdfb9f 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg) cpu = *((int *)arg); list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1) { + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); list_del(&dev->list); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d3..13700833c18 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -458,8 +470,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4..e2ab064c6d4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -880,6 +880,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts) { *ts = timespec_add_safe(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { diff --git a/kernel/timer.c b/kernel/timer.c index 15533b79239..c61a7949387 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1198,6 +1198,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); + perf_event_do_pending(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_event_do_pending(); - hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f28..60e2ce0181e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801..8c1b2d29071 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -464,6 +464,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -3060,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3081,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9..eac6875cb99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c9..50b1b823980 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee3..f4bc9b27de5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } |