summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c7
-rw-r--r--kernel/cpu.c10
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/fork.c15
-rw-r--r--kernel/futex.c30
-rw-r--r--kernel/hw_breakpoint.c58
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kfifo.c6
-rw-r--r--kernel/kgdb.c9
-rw-r--r--kernel/kprobes.c1
-rw-r--r--kernel/lockdep.c2
-rw-r--r--kernel/padata.c690
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/perf_event.c16
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/swsusp.c58
-rw-r--r--kernel/power/user.c23
-rw-r--r--kernel/printk.c1
-rw-r--r--kernel/resource.c44
-rw-r--r--kernel/sched.c44
-rw-r--r--kernel/sched_fair.c2
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/softlockup.c15
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/clocksource.c18
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/ring_buffer.c24
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_stack.c24
37 files changed, 1042 insertions, 159 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f..6aebdeb2aa3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044..aa3bee56644 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2936,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
- if (ss->use_id)
- if (alloc_css_id(ss, parent, cgrp))
+ if (ss->use_id) {
+ err = alloc_css_id(ss, parent, cgrp);
+ if (err)
goto err_destroy;
+ }
/* At error, ->destroy() callback has to free assigned ID. */
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee94..677f25376a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu &&
+ if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
- printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
- (state = %ld, flags = %x) \n",
- p->comm, task_pid_nr(p), cpu,
- p->state, p->flags);
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
+ "(state = %ld, flags = %x)\n",
+ p->comm, task_pid_nr(p), cpu,
+ p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b..1ed8ca18790 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_KEYS
new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
if (!new->tgcred) {
- kfree(new);
+ kmem_cache_free(cred_jar, new);
return NULL;
}
atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc..f88bd984df3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
- /*
- * The task hasn't been attached yet, so its cpus_allowed mask will
- * not be changed, nor will its assigned CPU.
- *
- * The cpus_allowed mask of the parent may have changed after it was
- * copied first time - so re-copy it here, then check the child's CPU
- * to ensure it is on a valid CPU (and if not, just force it back to
- * parent's CPU). This avoids alot of nasty races.
- */
- p->cpus_allowed = current->cpus_allowed;
- p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
- if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
- !cpu_online(task_cpu(p))))
- set_task_cpu(p, smp_processor_id());
-
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return -EINVAL;
WARN_ON(!atomic_read(&pi_state->refcount));
- WARN_ON(pid && pi_state->owner &&
- pi_state->owner->pid != pid);
+
+ /*
+ * When pi_state->owner is NULL then the owner died
+ * and another waiter is on the fly. pi_state->owner
+ * is fixed up by the task which acquires
+ * pi_state->rt_mutex.
+ *
+ * We do not check for pid == 0 which can happen when
+ * the owner died and robust_list_exit() cleared the
+ * TID.
+ */
+ if (pid && pi_state->owner) {
+ /*
+ * Bail out if user space manipulated the
+ * futex value.
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+ }
atomic_inc(&pi_state->refcount);
*ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
if (!pi_state)
return -EINVAL;
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ return -EINVAL;
+
raw_spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
@@ -1971,7 +1995,7 @@ retry_private:
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- goto out;
+ goto out_put_key;
out_unlock_put_key:
queue_unlock(&q, hb);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 50dbd599958..967e66143e1 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
* ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
* + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
*/
-int reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp)
{
struct bp_busy_slots slots = {0};
- int ret = 0;
-
- mutex_lock(&nr_bp_mutex);
fetch_bp_busy_slots(&slots, bp);
/* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
- ret = -ENOSPC;
- goto end;
- }
+ if (slots.pinned + (!!slots.flexible) == HBP_NUM)
+ return -ENOSPC;
toggle_bp_slot(bp, true);
-end:
+ return 0;
+}
+
+int reserve_bp_slot(struct perf_event *bp)
+{
+ int ret;
+
+ mutex_lock(&nr_bp_mutex);
+
+ ret = __reserve_bp_slot(bp);
+
mutex_unlock(&nr_bp_mutex);
return ret;
}
+/* Release the slot held by bp via toggle_bp_slot(); takes no lock itself. */
+static void __release_bp_slot(struct perf_event *bp)
+{
+ toggle_bp_slot(bp, false);
+}
+
void release_bp_slot(struct perf_event *bp)
{
mutex_lock(&nr_bp_mutex);
- toggle_bp_slot(bp, false);
+ __release_bp_slot(bp);
mutex_unlock(&nr_bp_mutex);
}
+/*
+ * Allow the kernel debugger to reserve and release breakpoint slots
+ * without taking nr_bp_mutex, by using the dbg_* variants of the
+ * reserve and release functions.
+ *
+ * dbg_reserve_bp_slot() returns -1 when nr_bp_mutex is currently locked,
+ * otherwise the result of __reserve_bp_slot().
+ */
+int dbg_reserve_bp_slot(struct perf_event *bp)
+{
+	return mutex_is_locked(&nr_bp_mutex) ? -1 : __reserve_bp_slot(bp);
+}
+
+/*
+ * Lockless counterpart of release_bp_slot() for the kernel debugger.
+ * Returns -1 without touching the slot when nr_bp_mutex is currently
+ * locked, 0 once __release_bp_slot() has run.
+ */
+int dbg_release_bp_slot(struct perf_event *bp)
+{
+	if (!mutex_is_locked(&nr_bp_mutex)) {
+		__release_bp_slot(bp);
+		return 0;
+	}
+
+	return -1;
+}
int register_perf_hw_breakpoint(struct perf_event *bp)
{
@@ -296,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
if (!bp->attr.disabled || !bp->overflow_handler)
ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+ /* if arch_validate_hwbkpt_settings() fails then release bp slot */
+ if (ret)
+ release_bp_slot(bp);
+
return ret;
}
@@ -324,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
u64 old_addr = bp->attr.bp_addr;
+ u64 old_len = bp->attr.bp_len;
int old_type = bp->attr.bp_type;
- int old_len = bp->attr.bp_len;
int err = 0;
perf_event_disable(bp);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a9a93d9ee7a..ef077fb7315 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
+#include <linux/kmsg_dump.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
if (mutex_trylock(&kexec_mutex)) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
+
+ kmsg_dump(KMSG_DUMP_KEXEC);
+
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750..35edbe22e9a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
buffer = kmalloc(size, gfp_mask);
if (!buffer) {
- _kfifo_init(fifo, 0, 0);
+ _kfifo_init(fifo, NULL, 0);
return -ENOMEM;
}
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
void kfifo_free(struct kfifo *fifo)
{
kfree(fifo->buffer);
+ _kfifo_init(fifo, NULL, 0);
}
EXPORT_SYMBOL(kfifo_free);
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
* @fifo: the fifo to be used.
* @from: pointer to the data to be added.
* @len: the length of the data to be added.
+ * @total: the actual returned data length.
*
* This function copies at most @len bytes from the @from into the
* FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
* @fifo: the fifo to be used.
* @to: where the data must be copied.
* @len: the size of the destination buffer.
- @ @lenout: pointer to output variable with copied data
+ * @lenout: pointer to output variable with copied data
*
* This function copies at most @len bytes from the FIFO into the
* @to buffer and 0 or -EFAULT.
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e2351..761fdd2b303 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
smp_wmb();
atomic_set(&cpu_in_kgdb[cpu], 1);
+ /* Disable any cpu specific hw breakpoints */
+ kgdb_disable_hw_debug(regs);
+
/* Wait till primary CPU is done with debugging */
while (atomic_read(&passive_cpu_wait[cpu]))
cpu_relax();
@@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
/* Signal the primary CPU that we are done: */
atomic_set(&cpu_in_kgdb[cpu], 0);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
}
@@ -1450,7 +1453,7 @@ acquirelock:
(kgdb_info[cpu].task &&
kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
@@ -1550,7 +1553,7 @@ kgdb_restore:
}
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sync();
clocksource_touch_watchdog();
local_irq_restore(flags);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a020..c4b43430d39 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -93,6 +93,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
{"native_get_debugreg",},
{"irq_entries_start",},
{"common_interrupt",},
+ {"mcount",}, /* mcount can be called from everywhere */
{NULL} /* Terminator */
};
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe4..c62ec14609b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
return ret;
return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ this, 0, irqclass);
}
void print_irqtrace_events(struct task_struct *curr)
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 00000000000..6f9bcb8313d
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,690 @@
+/*
+ * padata.c - generic interface to process data streams in parallel
+ *
+ * Copyright (C) 2008, 2009 secunet Security Networks AG
+ * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/cpu.h>
+#include <linux/padata.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Upper bound used to derive pd->max_seq_nr; it stays NR_CPUS below
+ * INT_MAX. The replacement list is parenthesized so that the macro
+ * behaves as one operand, e.g. in (MAX_SEQ_NR / num_cpus).
+ */
+#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
+/* Limit that pd->refcnt is compared against in padata_do_parallel(). */
+#define MAX_OBJ_NUM (10000 * NR_CPUS)
+
+/*
+ * Translate an index into pd->cpumask to a cpu number: start at the
+ * first cpu set in the mask and advance cpu_index times with
+ * cpumask_next().
+ */
+static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
+{
+	int found = cpumask_first(pd->cpumask);
+	int steps = cpu_index;
+
+	while (steps-- > 0)
+		found = cpumask_next(found, pd->cpumask);
+
+	return found;
+}
+
+/*
+ * Choose the cpu for a padata object: its seq_nr modulo the number of
+ * cpus set in pd->cpumask is used as an index into that mask.
+ */
+static int padata_cpu_hash(struct padata_priv *padata)
+{
+	struct parallel_data *pd = padata->pd;
+
+	return padata_index_to_cpu(pd,
+			padata->seq_nr % cpumask_weight(pd->cpumask));
+}
+
+/*
+ * Work function behind queue->pwork. Takes everything that is currently
+ * on queue->parallel.list and calls ->parallel() on each object, with
+ * BHs disabled for the whole run.
+ */
+static void padata_parallel_worker(struct work_struct *work)
+{
+	struct padata_queue *queue;
+	LIST_HEAD(local_list);
+
+	local_bh_disable();
+	queue = container_of(work, struct padata_queue, pwork);
+
+	/* Detach the pending objects so the lock is not held across callbacks. */
+	spin_lock(&queue->parallel.lock);
+	list_replace_init(&queue->parallel.list, &local_list);
+	spin_unlock(&queue->parallel.lock);
+
+	while (!list_empty(&local_list)) {
+		struct padata_priv *padata;
+
+		padata = list_entry(local_list.next,
+				    struct padata_priv, list);
+
+		list_del_init(&padata->list);
+
+		padata->parallel(padata);
+	}
+
+	local_bh_enable();
+}
+
+/*
+ * padata_do_parallel - padata parallelization function
+ *
+ * @pinst: padata instance
+ * @padata: object to be parallelized
+ * @cb_cpu: cpu the serialization callback function will run on,
+ * must be in the cpumask of padata.
+ *
+ * The parallelization callback function will run with BHs off.
+ * Note: Every object which is parallelized by padata_do_parallel
+ * must be seen by padata_do_serial.
+ *
+ * Returns 0 without queueing anything when PADATA_INIT is not set in
+ * pinst->flags, -EBUSY when PADATA_RESET is set or pd->refcnt has
+ * reached MAX_OBJ_NUM, -EINVAL when @cb_cpu is not in pd->cpumask,
+ * and -EINPROGRESS once the object has been put on the parallel list
+ * of the cpu selected by padata_cpu_hash() and the work was queued.
+ */
+int padata_do_parallel(struct padata_instance *pinst,
+ struct padata_priv *padata, int cb_cpu)
+{
+ int target_cpu, err;
+ struct padata_queue *queue;
+ struct parallel_data *pd;
+
+ rcu_read_lock_bh();
+
+ pd = rcu_dereference(pinst->pd);
+
+ err = 0;
+ if (!(pinst->flags & PADATA_INIT))
+ goto out;
+
+ err = -EBUSY;
+ if ((pinst->flags & PADATA_RESET))
+ goto out;
+
+ if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+ goto out;
+
+ err = -EINVAL;
+ if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
+ goto out;
+
+ err = -EINPROGRESS;
+ atomic_inc(&pd->refcnt);
+ padata->pd = pd;
+ padata->cb_cpu = cb_cpu;
+
+ if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
+ atomic_set(&pd->seq_nr, -1);
+
+ padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+
+ target_cpu = padata_cpu_hash(padata);
+ queue = per_cpu_ptr(pd->queue, target_cpu);
+
+ spin_lock(&queue->parallel.lock);
+ list_add_tail(&padata->list, &queue->parallel.list);
+ spin_unlock(&queue->parallel.lock);
+
+ queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+
+out:
+ rcu_read_unlock_bh();
+
+ return err;
+}
+EXPORT_SYMBOL(padata_do_parallel);
+
+/*
+ * Find the object that has to be serialized next.
+ *
+ * Every cpu in pd->cpumask is scanned and the queue with the lowest
+ * expected (or actually queued) sequence number is selected, with
+ * non-overrun queues taking precedence over overrun ones.
+ *
+ * Returns NULL when all reorder lists are empty, the object at the head
+ * of the selected reorder list (already unlinked, with the queue's
+ * num_obj incremented) when there is one, ERR_PTR(-ENODATA) when the
+ * selected list is empty and next_nr % num_cpus equals the selected
+ * queue's cpu_index, and ERR_PTR(-EINPROGRESS) otherwise.
+ */
+static struct padata_priv *padata_get_next(struct parallel_data *pd)
+{
+ int cpu, num_cpus, empty, calc_seq_nr;
+ int seq_nr, next_nr, overrun, next_overrun;
+ struct padata_queue *queue, *next_queue;
+ struct padata_priv *padata;
+ struct padata_list *reorder;
+
+ empty = 0;
+ next_nr = -1;
+ next_overrun = 0;
+ next_queue = NULL;
+
+ num_cpus = cpumask_weight(pd->cpumask);
+
+ for_each_cpu(cpu, pd->cpumask) {
+ queue = per_cpu_ptr(pd->queue, cpu);
+ reorder = &queue->reorder;
+
+ /*
+ * Calculate the seq_nr of the object that should be
+ * next in this queue.
+ */
+ overrun = 0;
+ calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
+ + queue->cpu_index;
+
+ if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
+ calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
+ overrun = 1;
+ }
+
+ if (!list_empty(&reorder->list)) {
+ padata = list_entry(reorder->list.next,
+ struct padata_priv, list);
+
+ seq_nr = padata->seq_nr;
+ BUG_ON(calc_seq_nr != seq_nr);
+ } else {
+ seq_nr = calc_seq_nr;
+ empty++;
+ }
+
+ if (next_nr < 0 || seq_nr < next_nr
+ || (next_overrun && !overrun)) {
+ next_nr = seq_nr;
+ next_overrun = overrun;
+ next_queue = queue;
+ }
+ }
+
+ padata = NULL;
+
+ if (empty == num_cpus)
+ goto out;
+
+ reorder = &next_queue->reorder;
+
+ if (!list_empty(&reorder->list)) {
+ padata = list_entry(reorder->list.next,
+ struct padata_priv, list);
+
+ if (unlikely(next_overrun)) {
+ for_each_cpu(cpu, pd->cpumask) {
+ queue = per_cpu_ptr(pd->queue, cpu);
+ atomic_set(&queue->num_obj, 0);
+ }
+ }
+
+ spin_lock(&reorder->lock);
+ list_del_init(&padata->list);
+ atomic_dec(&pd->reorder_objects);
+ spin_unlock(&reorder->lock);
+
+ atomic_inc(&next_queue->num_obj);
+
+ goto out;
+ }
+
+ if (next_nr % num_cpus == next_queue->cpu_index) {
+ padata = ERR_PTR(-ENODATA);
+ goto out;
+ }
+
+ padata = ERR_PTR(-EINPROGRESS);
+out:
+ return padata;
+}
+
+/*
+ * Move objects, in the order handed out by padata_get_next(), to the
+ * serial list of their cb_cpu and queue the serial work there.
+ *
+ * Only one caller at a time does this: whoever fails the trylock on
+ * pd->lock returns immediately. The loop ends when padata_get_next()
+ * returns NULL or -EINPROGRESS; on -ENODATA the function returns right
+ * away. After dropping the lock it starts over as long as
+ * pd->reorder_objects is non-zero.
+ */
+static void padata_reorder(struct parallel_data *pd)
+{
+ struct padata_priv *padata;
+ struct padata_queue *queue;
+ struct padata_instance *pinst = pd->pinst;
+
+try_again:
+ if (!spin_trylock_bh(&pd->lock))
+ goto out;
+
+ while (1) {
+ padata = padata_get_next(pd);
+
+ if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ break;
+
+ if (PTR_ERR(padata) == -ENODATA) {
+ spin_unlock_bh(&pd->lock);
+ goto out;
+ }
+
+ queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
+
+ spin_lock(&queue->serial.lock);
+ list_add_tail(&padata->list, &queue->serial.list);
+ spin_unlock(&queue->serial.lock);
+
+ queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
+ }
+
+ spin_unlock_bh(&pd->lock);
+
+ if (atomic_read(&pd->reorder_objects))
+ goto try_again;
+
+out:
+ return;
+}
+
+/*
+ * Work function behind queue->swork. Takes everything that is currently
+ * on queue->serial.list, calls ->serial() on each object and drops one
+ * pd->refcnt reference per object. Runs with BHs disabled.
+ */
+static void padata_serial_worker(struct work_struct *work)
+{
+	struct padata_queue *squeue;
+	struct parallel_data *pd;
+	struct padata_priv *obj;
+	LIST_HEAD(pending);
+
+	local_bh_disable();
+	squeue = container_of(work, struct padata_queue, swork);
+	pd = squeue->pd;
+
+	/* Detach the pending objects so the lock is not held across callbacks. */
+	spin_lock(&squeue->serial.lock);
+	list_replace_init(&squeue->serial.list, &pending);
+	spin_unlock(&squeue->serial.lock);
+
+	for (;;) {
+		if (list_empty(&pending))
+			break;
+
+		obj = list_entry(pending.next, struct padata_priv, list);
+		list_del_init(&obj->list);
+
+		obj->serial(obj);
+		atomic_dec(&pd->refcnt);
+	}
+	local_bh_enable();
+}
+
+/*
+ * padata_do_serial - padata serialization function
+ *
+ * @padata: object to be serialized.
+ *
+ * Must be called for every object that went through padata_do_parallel.
+ * The object is appended to the reorder list of the cpu this function
+ * runs on, then padata_reorder() is invoked for its parallel_data.
+ * The serialization callback function will run with BHs off.
+ */
+void padata_do_serial(struct padata_priv *padata)
+{
+	struct parallel_data *pd = padata->pd;
+	struct padata_queue *rqueue;
+	int this_cpu;
+
+	this_cpu = get_cpu();
+	rqueue = per_cpu_ptr(pd->queue, this_cpu);
+
+	spin_lock(&rqueue->reorder.lock);
+	atomic_inc(&pd->reorder_objects);
+	list_add_tail(&padata->list, &rqueue->reorder.list);
+	spin_unlock(&rqueue->reorder.lock);
+
+	put_cpu();
+
+	padata_reorder(pd);
+}
+EXPORT_SYMBOL(padata_do_serial);
+
+/*
+ * Allocate and initialize a parallel_data for @pinst.
+ *
+ * The effective cpumask is @cpumask restricted to cpu_active_mask. It is
+ * computed once, up front, so that the cpu_index numbering of the queues
+ * and pd->cpumask always describe the same set of cpus.
+ *
+ * Returns the new parallel_data, or NULL when an allocation fails or
+ * when the effective cpumask is empty (an empty mask would otherwise
+ * lead to a division by zero below).
+ */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+					     const struct cpumask *cpumask)
+{
+	int cpu, cpu_index, num_cpus;
+	struct padata_queue *queue;
+	struct parallel_data *pd;
+
+	cpu_index = 0;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		goto err;
+
+	pd->queue = alloc_percpu(struct padata_queue);
+	if (!pd->queue)
+		goto err_free_pd;
+
+	if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
+		goto err_free_queue;
+
+	cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+
+	num_cpus = cpumask_weight(pd->cpumask);
+	if (!num_cpus)
+		goto err_free_cpumask;
+
+	for_each_possible_cpu(cpu) {
+		queue = per_cpu_ptr(pd->queue, cpu);
+
+		queue->pd = pd;
+
+		/* Only cpus in the effective mask get a reorder index. */
+		if (cpumask_test_cpu(cpu, pd->cpumask)) {
+			queue->cpu_index = cpu_index;
+			cpu_index++;
+		} else
+			queue->cpu_index = -1;
+
+		INIT_LIST_HEAD(&queue->reorder.list);
+		INIT_LIST_HEAD(&queue->parallel.list);
+		INIT_LIST_HEAD(&queue->serial.list);
+		spin_lock_init(&queue->reorder.lock);
+		spin_lock_init(&queue->parallel.lock);
+		spin_lock_init(&queue->serial.lock);
+
+		INIT_WORK(&queue->pwork, padata_parallel_worker);
+		INIT_WORK(&queue->swork, padata_serial_worker);
+		atomic_set(&queue->num_obj, 0);
+	}
+
+	/*
+	 * Largest multiple of num_cpus that fits below MAX_SEQ_NR, minus
+	 * one. MAX_SEQ_NR is parenthesized at the point of use so that the
+	 * division applies to the whole bound whatever the macro's
+	 * replacement list looks like.
+	 */
+	pd->max_seq_nr = ((MAX_SEQ_NR) / num_cpus) * num_cpus - 1;
+
+	atomic_set(&pd->seq_nr, -1);
+	atomic_set(&pd->reorder_objects, 0);
+	atomic_set(&pd->refcnt, 0);
+	pd->pinst = pinst;
+	spin_lock_init(&pd->lock);
+
+	return pd;
+
+err_free_cpumask:
+	free_cpumask_var(pd->cpumask);
+err_free_queue:
+	free_percpu(pd->queue);
+err_free_pd:
+	kfree(pd);
+err:
+	return NULL;
+}
+
+/* Free a parallel_data together with its per-cpu queues and its cpumask. */
+static void padata_free_pd(struct parallel_data *pd)
+{
+	free_percpu(pd->queue);
+	free_cpumask_var(pd->cpumask);
+	kfree(pd);
+}
+
+/*
+ * Install pd_new as the parallel_data of the instance and free the
+ * previous one.
+ *
+ * PADATA_RESET stays set for the duration, which makes
+ * padata_do_parallel() return -EBUSY. The previous parallel_data is
+ * freed only after an RCU grace period, after its refcnt has dropped to
+ * zero and after the workqueue has been flushed.
+ *
+ * NOTE(review): padata_do_parallel() reads pinst->pd under
+ * rcu_read_lock_bh() while this waits with synchronize_rcu(); confirm
+ * that this pairing is intended.
+ */
+static void padata_replace(struct padata_instance *pinst,
+			   struct parallel_data *pd_new)
+{
+	struct parallel_data *stale = pinst->pd;
+
+	pinst->flags |= PADATA_RESET;
+
+	rcu_assign_pointer(pinst->pd, pd_new);
+
+	synchronize_rcu();
+
+	/* Wait until every object submitted to the old pd has been serialized. */
+	while (atomic_read(&stale->refcnt) != 0)
+		yield();
+
+	flush_workqueue(pinst->wq);
+
+	padata_free_pd(stale);
+
+	pinst->flags &= ~PADATA_RESET;
+}
+
+/*
+ * padata_set_cpumask - set the cpumask that padata should use
+ *
+ * @pinst: padata instance
+ * @cpumask: the cpumask to use
+ *
+ * Builds a new parallel_data for @cpumask, records the mask in the
+ * instance and swaps the parallel_data in, all under pinst->lock.
+ * Returns 0 on success and -ENOMEM when no parallel_data could be
+ * allocated (the instance is left untouched in that case).
+ */
+int padata_set_cpumask(struct padata_instance *pinst,
+			cpumask_var_t cpumask)
+{
+	struct parallel_data *new_pd;
+	int ret;
+
+	might_sleep();
+
+	mutex_lock(&pinst->lock);
+
+	new_pd = padata_alloc_pd(pinst, cpumask);
+	if (new_pd) {
+		cpumask_copy(pinst->cpumask, cpumask);
+		padata_replace(pinst, new_pd);
+		ret = 0;
+	} else {
+		ret = -ENOMEM;
+	}
+
+	mutex_unlock(&pinst->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(padata_set_cpumask);
+
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is in
+ * cpu_active_mask; otherwise do nothing. The callers in this file take
+ * pinst->lock around the call. Returns 0 or -ENOMEM.
+ */
+static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+	struct parallel_data *new_pd;
+
+	if (!cpumask_test_cpu(cpu, cpu_active_mask))
+		return 0;
+
+	new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+	if (!new_pd)
+		return -ENOMEM;
+
+	padata_replace(pinst, new_pd);
+
+	return 0;
+}
+
+/*
+ * padata_add_cpu - add a cpu to the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to add
+ *
+ * Sets @cpu in pinst->cpumask and lets __padata_add_cpu() rebuild the
+ * parallel_data, both under pinst->lock. Returns what
+ * __padata_add_cpu() returns.
+ */
+int padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+	int ret;
+
+	might_sleep();
+
+	mutex_lock(&pinst->lock);
+	cpumask_set_cpu(cpu, pinst->cpumask);
+	ret = __padata_add_cpu(pinst, cpu);
+	mutex_unlock(&pinst->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(padata_add_cpu);
+
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is in
+ * cpu_online_mask; otherwise do nothing. The callers in this file take
+ * pinst->lock around the call. Returns 0 or -ENOMEM.
+ */
+static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+	struct parallel_data *new_pd;
+
+	if (!cpumask_test_cpu(cpu, cpu_online_mask))
+		return 0;
+
+	new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+	if (!new_pd)
+		return -ENOMEM;
+
+	padata_replace(pinst, new_pd);
+
+	return 0;
+}
+
+/*
+ * padata_remove_cpu - remove a cpu from the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to remove
+ *
+ * Clears @cpu in pinst->cpumask and lets __padata_remove_cpu() rebuild
+ * the parallel_data, both under pinst->lock. Returns what
+ * __padata_remove_cpu() returns.
+ */
+int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+	int ret;
+
+	might_sleep();
+
+	mutex_lock(&pinst->lock);
+	cpumask_clear_cpu(cpu, pinst->cpumask);
+	ret = __padata_remove_cpu(pinst, cpu);
+	mutex_unlock(&pinst->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(padata_remove_cpu);
+
+/*
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ *
+ * Sets PADATA_INIT in pinst->flags under pinst->lock.
+ */
+void padata_start(struct padata_instance *pinst)
+{
+	might_sleep();
+
+	mutex_lock(&pinst->lock);
+	pinst->flags = pinst->flags | PADATA_INIT;
+	mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_start);
+
+/*
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ *
+ * Clears PADATA_INIT in pinst->flags under pinst->lock.
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+	might_sleep();
+
+	mutex_lock(&pinst->lock);
+	pinst->flags = pinst->flags & ~PADATA_INIT;
+	mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+/*
+ * CPU hotplug notifier of a padata instance.
+ *
+ * Events for cpus that are not set in pinst->cpumask are ignored.
+ * CPU_ONLINE and CPU_DOWN_FAILED rebuild the parallel_data through
+ * __padata_add_cpu(); CPU_DOWN_PREPARE and CPU_UP_CANCELED do so through
+ * __padata_remove_cpu(). Only the ONLINE and DOWN_PREPARE cases report a
+ * failure (NOTIFY_BAD); everything else returns NOTIFY_OK.
+ */
+static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	int err;
+	struct padata_instance *pinst;
+	int cpu = (unsigned long)hcpu;
+
+	pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		err = __padata_add_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		if (err)
+			return NOTIFY_BAD;
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		err = __padata_remove_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		if (err)
+			return NOTIFY_BAD;
+		break;
+
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		__padata_remove_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		/*
+		 * Stop here: falling through into CPU_DOWN_FAILED would run
+		 * __padata_add_cpu() right after the removal.
+		 */
+		break;
+
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		if (!cpumask_test_cpu(cpu, pinst->cpumask))
+			break;
+		mutex_lock(&pinst->lock);
+		__padata_add_cpu(pinst, cpu);
+		mutex_unlock(&pinst->lock);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * padata_alloc - allocate and initialize a padata instance
+ *
+ * @cpumask: cpumask that padata uses for parallelization
+ * @wq: workqueue to use for the allocated padata instance
+ *
+ * Returns the new instance, or NULL when an allocation or the
+ * registration of the cpu hotplug notifier fails. The instance starts
+ * with flags == 0, i.e. padata_start() has to be called before objects
+ * are accepted.
+ */
+struct padata_instance *padata_alloc(const struct cpumask *cpumask,
+				     struct workqueue_struct *wq)
+{
+	int err;
+	struct padata_instance *pinst;
+	struct parallel_data *pd;
+
+	pinst = kzalloc(sizeof(*pinst), GFP_KERNEL);
+	if (!pinst)
+		goto err;
+
+	pd = padata_alloc_pd(pinst, cpumask);
+	if (!pd)
+		goto err_free_inst;
+
+	/*
+	 * The instance mask needs its own storage before it is copied into.
+	 * This assumes pinst->cpumask is a cpumask_var_t like pd->cpumask.
+	 */
+	if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
+		goto err_free_pd;
+
+	rcu_assign_pointer(pinst->pd, pd);
+
+	pinst->wq = wq;
+
+	cpumask_copy(pinst->cpumask, cpumask);
+
+	pinst->flags = 0;
+
+	/*
+	 * padata_cpu_callback() takes pinst->lock, so the mutex has to be
+	 * usable before the notifier is registered.
+	 */
+	mutex_init(&pinst->lock);
+
+	pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+	pinst->cpu_notifier.priority = 0;
+	err = register_hotcpu_notifier(&pinst->cpu_notifier);
+	if (err)
+		goto err_free_cpumask;
+
+	return pinst;
+
+err_free_cpumask:
+	free_cpumask_var(pinst->cpumask);
+err_free_pd:
+	padata_free_pd(pd);
+err_free_inst:
+	kfree(pinst);
+err:
+	return NULL;
+}
+EXPORT_SYMBOL(padata_alloc);
+
+/*
+ * padata_free - free a padata instance
+ *
+ * @pinst: padata instance to free
+ *
+ * Stops the instance, waits for an RCU grace period and for the refcnt
+ * of its parallel_data to drop to zero, then unregisters the hotplug
+ * notifier and frees the parallel_data, the instance cpumask and the
+ * instance itself.
+ */
+void padata_free(struct padata_instance *pinst)
+{
+	padata_stop(pinst);
+
+	synchronize_rcu();
+
+	while (atomic_read(&pinst->pd->refcnt) != 0)
+		yield();
+
+	unregister_hotcpu_notifier(&pinst->cpu_notifier);
+	padata_free_pd(pinst->pd);
+	free_cpumask_var(pinst->cpumask);
+	kfree(pinst);
+}
+EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 5827f7b9725..c787333282b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...)
dump_stack();
#endif
- kmsg_dump(KMSG_DUMP_PANIC);
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
@@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
crash_kexec(NULL);
+ kmsg_dump(KMSG_DUMP_PANIC);
+
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 603c0d8b5df..2ae7409bf38 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3259,8 +3259,6 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
- task_event->event_id.time = perf_clock();
-
perf_output_put(&handle, task_event->event_id);
perf_output_end(&handle);
@@ -3268,6 +3266,9 @@ static void perf_event_task_output(struct perf_event *event,
static int perf_event_task_match(struct perf_event *event)
{
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
@@ -3297,7 +3298,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
cpuctx = &get_cpu_var(perf_cpu_context);
perf_event_task_ctx(&cpuctx->ctx, task_event);
if (!ctx)
- ctx = rcu_dereference(task_event->task->perf_event_ctxp);
+ ctx = rcu_dereference(current->perf_event_ctxp);
if (ctx)
perf_event_task_ctx(ctx, task_event);
put_cpu_var(perf_cpu_context);
@@ -3328,6 +3329,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
+ .time = perf_clock(),
},
};
@@ -3377,6 +3379,9 @@ static void perf_event_comm_output(struct perf_event *event,
static int perf_event_comm_match(struct perf_event *event)
{
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
@@ -3494,6 +3499,9 @@ static void perf_event_mmap_output(struct perf_event *event,
static int perf_event_mmap_match(struct perf_event *event,
struct perf_mmap_event *mmap_event)
{
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
@@ -4571,7 +4579,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->type >= PERF_TYPE_MAX)
return -EINVAL;
- if (attr->__reserved_1 || attr->__reserved_2)
+ if (attr->__reserved_1)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb..5c36ea9d55d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
code. This is helpful when debugging and reporting PM bugs, like
suspend support.
+config PM_ADVANCED_DEBUG
+ bool "Extra PM attributes in sysfs for low-level debugging/testing"
+ depends on PM_DEBUG
+ default n
+ ---help---
+ Add extra sysfs attributes allowing one to access some Power Management
+ fields of device objects from user space. If you are not a kernel
+ developer interested in debugging/testing Power Management, say "no".
+
config PM_VERBOSE
bool "Verbose Power Management debugging"
depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
default y
+config PM_SLEEP_ADVANCED_DEBUG
+ bool
+ depends on PM_ADVANCED_DEBUG
+ default n
+
config SUSPEND
bool "Suspend to RAM and standby"
depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
and the bus type drivers of the buses the devices are on are
responsible for the actual handling of the autosuspend requests and
wake-up events.
+
+config PM_OPS
+ bool
+ depends on PM_SLEEP || PM_RUNTIME
+ default y
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c713905..b58800b21fc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
== NOTIFY_BAD) ? -EINVAL : 0;
}
+/* If set, devices may be suspended and resumed asynchronously. */
+int pm_async_enabled = 1;
+
+/* Format pm_async_enabled as a decimal number plus newline into buf. */
+static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", pm_async_enabled);
+}
+
+/*
+ * Parse buf as a decimal number and store it in pm_async_enabled.
+ * Only 0 and 1 are accepted; anything else yields -EINVAL. Returns n on
+ * success.
+ */
+static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
+			      const char *buf, size_t n)
+{
+	unsigned long val;
+
+	/* val is only examined when parsing succeeded (|| short-circuits). */
+	if (strict_strtoul(buf, 10, &val) || val > 1)
+		return -EINVAL;
+
+	pm_async_enabled = val;
+	return n;
+}
+
+power_attr(pm_async);
+
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_TRACE
&pm_trace_attr.attr,
#endif
-#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
+#ifdef CONFIG_PM_SLEEP
+ &pm_async_attr.attr,
+#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
+#endif
NULL,
};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e433..830cadecbdf 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
memory_bm_position_reset(&copy_bm);
- while (to_free_normal > 0 && to_free_highmem > 0) {
+ while (to_free_normal > 0 || to_free_highmem > 0) {
unsigned long pfn = memory_bm_next_pfn(&copy_bm);
struct page *page = pfn_to_page(pfn);
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- printk(KERN_INFO "PM: Creating hibernation image: \n");
+ printk(KERN_INFO "PM: Creating hibernation image:\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9..1d575733d4e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
struct swsusp_info *header;
*flags_p = swsusp_header->flags;
- if (IS_ERR(resume_bdev)) {
- pr_debug("PM: Image device not initialised\n");
- return PTR_ERR(resume_bdev);
- }
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd189..00000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * linux/kernel/power/swsusp.c
- *
- * This file provides code to write suspend image to swap and read it back.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
- *
- * This file is released under the GPLv2.
- *
- * I'd like to thank the following people for their work:
- *
- * Pavel Machek <pavel@ucw.cz>:
- * Modifications, defectiveness pointing, being with me at the very beginning,
- * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
- *
- * Steve Doddi <dirk@loth.demon.co.uk>:
- * Support the possibility of hardware state restoring.
- *
- * Raph <grey.havens@earthling.net>:
- * Support for preserving states of network devices and virtual console
- * (including X and svgatextmode)
- *
- * Kurt Garloff <garloff@suse.de>:
- * Straightened the critical function in order to prevent compilers from
- * playing tricks with local variables.
- *
- * Andreas Mohr <a.mohr@mailto.de>
- *
- * Alex Badea <vampire@go.ro>:
- * Fixed runaway init
- *
- * Rafael J. Wysocki <rjw@sisk.pl>
- * Reworked the freeing of memory and the handling of swap
- *
- * More state savers are welcome. Especially for the scsi layer...
- *
- * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
- */
-
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/swap.h>
-#include <linux/pm.h>
-#include <linux/swapops.h>
-#include <linux/bootmem.h>
-#include <linux/syscalls.h>
-#include <linux/highmem.h>
-#include <linux/time.h>
-#include <linux/rbtree.h>
-#include <linux/io.h>
-
-#include "power.h"
-
-int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f..4d2289626a8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
return res;
}
+static void snapshot_deprecated_ioctl(unsigned int cmd)
+{
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
+ "be removed soon, update your suspend-to-disk "
+ "utilities\n",
+ __builtin_return_address(0), cmd);
+}
+
static long snapshot_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
data->frozen = 0;
break;
- case SNAPSHOT_CREATE_IMAGE:
case SNAPSHOT_ATOMIC_SNAPSHOT:
+ snapshot_deprecated_ioctl(cmd);
+ case SNAPSHOT_CREATE_IMAGE:
if (data->mode != O_RDONLY || !data->frozen || data->ready) {
error = -EPERM;
break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
data->ready = 0;
break;
- case SNAPSHOT_PREF_IMAGE_SIZE:
case SNAPSHOT_SET_IMAGE_SIZE:
+ snapshot_deprecated_ioctl(cmd);
+ case SNAPSHOT_PREF_IMAGE_SIZE:
image_size = arg;
break;
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = put_user(size, (loff_t __user *)arg);
break;
- case SNAPSHOT_AVAIL_SWAP_SIZE:
case SNAPSHOT_AVAIL_SWAP:
+ snapshot_deprecated_ioctl(cmd);
+ case SNAPSHOT_AVAIL_SWAP_SIZE:
size = count_swap_pages(data->swap, 1);
size <<= PAGE_SHIFT;
error = put_user(size, (loff_t __user *)arg);
break;
- case SNAPSHOT_ALLOC_SWAP_PAGE:
case SNAPSHOT_GET_SWAP_PAGE:
+ snapshot_deprecated_ioctl(cmd);
+ case SNAPSHOT_ALLOC_SWAP_PAGE:
if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
error = -ENODEV;
break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
+ snapshot_deprecated_ioctl(cmd);
if (!swsusp_swap_in_use()) {
/*
* User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
+ snapshot_deprecated_ioctl(cmd);
error = -EINVAL;
switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index 17463ca2e22..1751c456b71 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
static const char const *kmsg_reasons[] = {
[KMSG_DUMP_OOPS] = "oops",
[KMSG_DUMP_PANIC] = "panic",
+ [KMSG_DUMP_KEXEC] = "kexec",
};
static const char *kmsg_to_str(enum kmsg_dump_reason reason)
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54..24e9e60c145 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
return -EINVAL;
}
+static void __release_child_resources(struct resource *r)
+{
+ struct resource *tmp, *p;
+ resource_size_t size;
+
+ p = r->child;
+ r->child = NULL;
+ while (p) {
+ tmp = p;
+ p = p->sibling;
+
+ tmp->parent = NULL;
+ tmp->sibling = NULL;
+ __release_child_resources(tmp);
+
+ printk(KERN_DEBUG "release child resource %pR\n", tmp);
+ /* need to restore size, and keep flags */
+ size = resource_size(tmp);
+ tmp->start = 0;
+ tmp->end = size - 1;
+ }
+}
+
+void release_child_resources(struct resource *r)
+{
+ write_lock(&resource_lock);
+ __release_child_resources(r);
+ write_unlock(&resource_lock);
+}
+
/**
* request_resource - request and reserve an I/O or memory resource
* @root: root resource descriptor
@@ -303,8 +333,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
static int find_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
resource_size_t max, resource_size_t align,
- void (*alignf)(void *, struct resource *,
- resource_size_t, resource_size_t),
+ resource_size_t (*alignf)(void *,
+ const struct resource *,
+ resource_size_t,
+ resource_size_t),
void *alignf_data)
{
struct resource *this = root->child;
@@ -330,7 +362,7 @@ static int find_resource(struct resource *root, struct resource *new,
tmp.end = max;
tmp.start = ALIGN(tmp.start, align);
if (alignf)
- alignf(alignf_data, &tmp, size, align);
+ tmp.start = alignf(alignf_data, &tmp, size, align);
if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
new->start = tmp.start;
new->end = tmp.start + size - 1;
@@ -358,8 +390,10 @@ static int find_resource(struct resource *root, struct resource *new,
int allocate_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
resource_size_t max, resource_size_t align,
- void (*alignf)(void *, struct resource *,
- resource_size_t, resource_size_t),
+ resource_size_t (*alignf)(void *,
+ const struct resource *,
+ resource_size_t,
+ resource_size_t),
void *alignf_data)
{
int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index c535cc4f642..3a8fb30a91b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * Called from:
+ * Gets called from 3 sites (exec, fork, wakeup). Since it is called without
+ * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done
+ * by:
*
- * - fork, @p is stable because it isn't on the tasklist yet
- *
- * - exec, @p is unstable, retry loop
- *
- * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- * we should be good.
+ * exec: is unstable, retry loop
+ * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
*/
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
-#ifdef CONFIG_SMP
- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
set_task_cpu(p, cpu);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
+ int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+ /*
+ * Fork balancing, do it here and not earlier because:
+ * - cpus_allowed can change in the fork path
+ * - any previously selected cpu might disappear through hotplug
+ *
+ * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+ * ->cpus_allowed is stable; we have preemption disabled, meaning
+ * cpu_online_mask is stable.
+ */
+ cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+ set_task_cpu(p, cpu);
+#endif
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
p->sched_class->task_woken(rq, p);
#endif
task_rq_unlock(rq, &flags);
+ put_cpu();
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -5530,8 +5541,11 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(current) < 0))
+ if (unlikely(reacquire_kernel_lock(current) < 0)) {
+ prev = rq->curr;
+ switch_count = &prev->nivcsw;
goto need_resched_nonpreemptible;
+ }
preempt_enable_no_resched();
if (need_resched())
@@ -7136,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
* the ->cpus_allowed mask from under waking tasks, which would be
* possible when we change rq->lock in ttwu(), so synchronize against
* TASK_WAKING to avoid that.
+ *
+ * Make an exception for freshly cloned tasks, since cpuset namespaces
+ * might move the task about, we have to validate the target in
+ * wake_up_new_task() anyway since the cpu might have gone away.
*/
again:
- while (p->state == TASK_WAKING)
+ while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
cpu_relax();
rq = task_rq_lock(p, &flags);
- if (p->state == TASK_WAKING) {
+ if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
task_rq_unlock(rq, &flags);
goto again;
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9f66f..8fe7ee81c55 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
* If there's an idle sibling in this domain, make that
* the wake_affine target instead of the current cpu.
*/
- if (tmp->flags & SD_PREFER_SIBLING)
+ if (tmp->flags & SD_SHARE_PKG_RESOURCES)
target = select_idle_sibling(p, tmp, target);
if (target >= 0) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef7..7c1a67ef027 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
*/
/*
- * The trampoline is called when the hrtimer expires. If this is
- * called from the hrtimer interrupt then we schedule the tasklet as
- * the timer callback function expects to run in softirq context. If
- * it's called in softirq context anyway (i.e. high resolution timers
- * disabled) then the hrtimer callback is called right away.
+ * The trampoline is called when the hrtimer expires. It schedules a tasklet
+ * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
+ * hrtimer callback, but from softirq context.
*/
static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
{
struct tasklet_hrtimer *ttimer =
container_of(timer, struct tasklet_hrtimer, timer);
- if (hrtimer_is_hres_active(timer)) {
- tasklet_hi_schedule(&ttimer->tasklet);
- return HRTIMER_NORESTART;
- }
- return ttimer->function(timer);
+ tasklet_hi_schedule(&ttimer->tasklet);
+ return HRTIMER_NORESTART;
}
/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e2..0d4c7898ab8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(bool, softlock_touch_sync);
static int __read_mostly did_panic;
int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
+void touch_softlockup_watchdog_sync(void)
+{
+ __raw_get_cpu_var(softlock_touch_sync) = true;
+ __raw_get_cpu_var(softlockup_touch_ts) = 0;
+}
+
void touch_all_softlockup_watchdogs(void)
{
int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
}
if (touch_ts == 0) {
+ if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
+ /*
+ * If the time stamp was touched atomically,
+ * make sure the scheduler tick is up to date.
+ */
+ per_cpu(softlock_touch_sync, this_cpu) = false;
+ sched_clock_tick();
+ }
__touch_softlockup_watchdog();
return;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b8..18bde979f34 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
if (which > PRIO_USER || which < PRIO_PROCESS)
return -EINVAL;
+ rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
case PRIO_PROCESS:
@@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
}
out_unlock:
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return retval;
}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 6f740d9f094..d7395fdfb9f 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg)
cpu = *((int *)arg);
list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
if (cpumask_test_cpu(cpu, dev->cpumask) &&
- cpumask_weight(dev->cpumask) == 1) {
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
list_del(&dev->list);
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d3..13700833c18 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
{
unsigned long flags;
- spin_lock_irqsave(&watchdog_lock, flags);
+ /*
+ * We use trylock here to avoid a potential deadlock when
+ * kgdb calls this code after the kernel has been stopped with
+ * watchdog_lock held. When watchdog_lock is held we just
+ * return and accept that the watchdog might trigger and mark
+ * the monitored clock source (usually TSC) unstable.
+ *
+ * This does not affect the other caller clocksource_resume()
+ * because at this point the kernel is UP, interrupts are
+ * disabled and nothing can hold watchdog_lock.
+ */
+ if (!spin_trylock_irqsave(&watchdog_lock, flags))
+ return;
clocksource_reset_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
}
@@ -458,8 +470,8 @@ void clocksource_resume(void)
* clocksource_touch_watchdog - Update watchdog
*
* Update the watchdog after exception contexts such as kgdb so as not
- * to incorrectly trip the watchdog.
- *
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
*/
void clocksource_touch_watchdog(void)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4..e2ab064c6d4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -880,6 +880,7 @@ void getboottime(struct timespec *ts)
set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
}
+EXPORT_SYMBOL_GPL(getboottime);
/**
* monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
{
*ts = timespec_add_safe(*ts, total_sleep_time);
}
+EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
unsigned long get_seconds(void)
{
diff --git a/kernel/timer.c b/kernel/timer.c
index 15533b79239..c61a7949387 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1198,6 +1198,7 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
printk_tick();
+ perf_event_do_pending();
scheduler_tick();
run_posix_cpu_timers(p);
}
@@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
- perf_event_do_pending();
-
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f28..60e2ce0181e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
config HAVE_FUNCTION_GRAPH_FP_TEST
bool
help
- An arch may pass in a unique value (frame pointer) to both the
- entering and exiting of a function. On exit, the value is compared
- and if it does not match, then it will panic the kernel.
+ See Documentation/trace/ftrace-design.txt
config HAVE_FUNCTION_TRACE_MCOUNT_TEST
bool
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801..8c1b2d29071 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long head;
struct buffer_page *head_page;
+ struct buffer_page *cache_reader_page;
+ unsigned long cache_read;
u64 read_stamp;
};
@@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->cache_reader_page = cpu_buffer->reader_page;
+ iter->cache_read = cpu_buffer->read;
}
/**
@@ -3060,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_event *event;
int nr_loops = 0;
- if (ring_buffer_iter_empty(iter))
- return NULL;
-
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
+ /*
+ * Check if someone performed a consuming read on
+ * the buffer. A consuming read invalidates the iterator
+ * and we need to reset the iterator in this case.
+ */
+ if (unlikely(iter->cache_read != cpu_buffer->read ||
+ iter->cache_reader_page != cpu_buffer->reader_page))
+ rb_iter_reset(iter);
+
again:
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
/*
* We repeat when a timestamp is encountered.
* We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
+ if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ rb_inc_iter(iter);
+ goto again;
+ }
+
event = rb_iter_head_event(iter);
switch (event->type_len) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9..eac6875cb99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[])
return;
}
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
if (pid > PID_MAX_DEFAULT) {
strcpy(comm, "<...>");
return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ea90c0e2c9..50b1b823980 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv)
return -EINVAL;
}
/* an address specified */
- ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
+ ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
if (ret) {
pr_info("Failed to parse address.\n");
return ret;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee3..f4bc9b27de5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
unsigned long val, flags;
char buf[64];
int ret;
+ int cpu;
if (count >= sizeof(buf))
return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
return ret;
local_irq_save(flags);
+
+ /*
+ * In case we trace inside arch_spin_lock() or after (NMI),
+ * we will cause a circular lock, so we also need to increase
+ * the percpu trace_active here.
+ */
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
arch_spin_lock(&max_stack_lock);
*ptr = val;
arch_spin_unlock(&max_stack_lock);
+
+ per_cpu(trace_active, cpu)--;
local_irq_restore(flags);
return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
+ int cpu;
+
local_irq_disable();
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
arch_spin_lock(&max_stack_lock);
if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void t_stop(struct seq_file *m, void *p)
{
+ int cpu;
+
arch_spin_unlock(&max_stack_lock);
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)--;
+
local_irq_enable();
}