summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/cgroup_freezer.c51
-rw-r--r--kernel/cpu.c3
-rw-r--r--kernel/freezer.c20
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/resource.c6
-rw-r--r--kernel/sched.c13
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c76
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/smp.c18
-rw-r--r--kernel/timer.c129
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c56
-rw-r--r--kernel/trace/trace.c48
-rw-r--r--kernel/trace/trace.h20
-rw-r--r--kernel/workqueue.c45
20 files changed, 362 insertions, 146 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c..358e77564e6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2497,7 +2497,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
list_del(&cgrp->sibling);
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- cgrp->dentry = NULL;
spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e9505695449..7fa476f01d0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -162,9 +162,13 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
struct task_struct *task)
{
struct freezer *freezer;
- int retval;
- /* Anything frozen can't move or be moved to/from */
+ /*
+ * Anything frozen can't move or be moved to/from.
+ *
+ * Since orig_freezer->state == FROZEN means that @task has been
+ * frozen, so it's sufficient to check the latter condition.
+ */
if (is_task_frozen_enough(task))
return -EBUSY;
@@ -173,13 +177,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
- retval = 0;
- task_lock(task);
- freezer = task_freezer(task);
- if (freezer->state == CGROUP_FROZEN)
- retval = -EBUSY;
- task_unlock(task);
- return retval;
+ return 0;
}
static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
@@ -190,8 +188,9 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
freezer = task_freezer(task);
task_unlock(task);
- BUG_ON(freezer->state == CGROUP_FROZEN);
spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+
/* Locking avoids race with FREEZING -> THAWED transitions. */
if (freezer->state == CGROUP_FREEZING)
freeze_task(task, true);
@@ -276,25 +275,18 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
return num_cant_freeze_now ? -EBUSY : 0;
}
-static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
- int do_wake;
-
- task_lock(task);
- do_wake = __thaw_process(task);
- task_unlock(task);
- if (do_wake)
- wake_up_process(task);
+ thaw_process(task);
}
cgroup_iter_end(cgroup, &it);
- freezer->state = CGROUP_THAWED;
- return 0;
+ freezer->state = CGROUP_THAWED;
}
static int freezer_change_state(struct cgroup *cgroup,
@@ -304,27 +296,22 @@ static int freezer_change_state(struct cgroup *cgroup,
int retval = 0;
freezer = cgroup_freezer(cgroup);
+
spin_lock_irq(&freezer->lock);
+
update_freezer_state(cgroup, freezer);
if (goal_state == freezer->state)
goto out;
- switch (freezer->state) {
+
+ switch (goal_state) {
case CGROUP_THAWED:
- retval = try_to_freeze_cgroup(cgroup, freezer);
+ unfreeze_cgroup(cgroup, freezer);
break;
- case CGROUP_FREEZING:
- if (goal_state == CGROUP_FROZEN) {
- /* Userspace is retrying after
- * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
- retval = try_to_freeze_cgroup(cgroup, freezer);
- break;
- }
- /* state == FREEZING and goal_state == THAWED, so unfreeze */
case CGROUP_FROZEN:
- retval = unfreeze_cgroup(cgroup, freezer);
+ retval = try_to_freeze_cgroup(cgroup, freezer);
break;
default:
- break;
+ BUG();
}
out:
spin_unlock_irq(&freezer->lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045dae..5a732c5ef08 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index ba6248b323e..2f4936cf708 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -121,16 +121,7 @@ void cancel_freezing(struct task_struct *p)
}
}
-/*
- * Wake up a frozen process
- *
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails. Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
- */
-int __thaw_process(struct task_struct *p)
+static int __thaw_process(struct task_struct *p)
{
if (frozen(p)) {
p->flags &= ~PF_FROZEN;
@@ -140,6 +131,15 @@ int __thaw_process(struct task_struct *p)
return 0;
}
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails. Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
int thaw_process(struct task_struct *p)
{
task_lock(p);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index dcd165f92a8..23bd4daeb96 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -96,7 +96,7 @@ config SUSPEND
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
- depends on SUSPEND && PM_DEBUG && RTC_LIB=y
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
---help---
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
diff --git a/kernel/profile.c b/kernel/profile.c
index a9e422df6bf..9830a037d8d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -102,7 +102,7 @@ int profile_setup(char *str)
__setup("profile=", profile_setup);
-int profile_init(void)
+int __ref profile_init(void)
{
int buffer_bytes;
if (!prof_on)
diff --git a/kernel/resource.c b/kernel/resource.c
index 7fec0e42723..4337063663e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -17,6 +17,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/device.h>
+#include <linux/pfn.h>
#include <asm/io.h>
@@ -522,7 +523,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
if (!res)
return;
@@ -849,7 +850,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
continue;
if (p->end < addr)
continue;
- if (p->start <= addr && (p->end >= addr + size - 1))
+ if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
+ PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
continue;
printk(KERN_WARNING "resource map sanity check conflict: "
"0x%llx 0x%llx 0x%llx 0x%llx %s\n",
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f46..57c933ffbee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,7 +397,7 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
unsigned long nr_spread_over;
@@ -1805,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -6875,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7672,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ad958c1ec70..5ae17762ec3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -319,7 +319,7 @@ static int __init init_sched_debug_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
+ pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78f..51aa3e102ac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->rb_leftmost = next_node;
}
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
-
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rb_leftmost;
-}
-
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
- return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
}
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -741,6 +738,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
#endif
}
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
@@ -794,24 +797,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
- return se;
-
- return cfs_rq->next;
-}
-
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = NULL;
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
- if (first_fair(cfs_rq)) {
- se = __pick_next_entity(cfs_rq);
- se = pick_next(cfs_rq, se);
- set_next_entity(cfs_rq, se);
- }
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+ return cfs_rq->next;
+
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+ return cfs_rq->last;
return se;
}
@@ -1325,26 +1319,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
return 0;
}
+static void set_last_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
if (unlikely(rt_prio(p->prio))) {
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
update_rq_clock(rq);
update_curr(cfs_rq);
resched_task(curr);
return;
}
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return;
+
if (unlikely(se == pse))
return;
- cfs_rq_of(pse)->next = pse;
+ /*
+ * Only set the backward buddy when the current task is still on the
+ * rq. This can happen when a wakeup gets interleaved with schedule on
+ * the ->pre_schedule() or idle_balance() point, either of which can
+ * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class, for
+ * obvious reasons its a bad idea to schedule back to the idle thread.
+ */
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ set_last_buddy(se);
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1417,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda01621829..da5d93b5d2c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/signal.c b/kernel/signal.c
index 105217da5c8..4530fc65445 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1144,7 +1144,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
struct task_struct * p;
for_each_process(p) {
- if (p->pid > 1 && !same_thread_group(p, current)) {
+ if (task_pid_vnr(p) > 1 &&
+ !same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
++count;
if (err != -EPERM)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a855377..75c8dde58c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
{
/* Wait for response */
do {
- /*
- * We need to see the flags store in the IPI handler
- */
- smp_mb();
if (!(data->flags & CSD_FLAG_WAIT))
break;
cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
list_add_tail(&data->list, &dst->list);
spin_unlock_irqrestore(&dst->lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
if (ipi)
arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
- smp_mb();
+ smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
/*
* See comment on outer loop
*/
- smp_mb();
+ smp_read_barrier_depends();
}
}
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi(mask);
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c5..dbd50fabe4c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
tbase_get_deferrable(timer->base));
}
-/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
{
int rem;
unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
+ * But never round down if @force_up is set.
*/
- if (rem < HZ/4) /* round down */
+ if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
return original;
return j;
}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
- /*
- * In theory the following code can skip a jiffy in case jiffies
- * increments right between the addition and the later subtraction.
- * However since the entire point of this function is to use approximate
- * timeouts, it's entirely ok to not handle that.
- */
- return __round_jiffies(j + jiffies, cpu) - jiffies;
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
*/
unsigned long round_jiffies(unsigned long j)
{
- return __round_jiffies(j, raw_smp_processor_id());
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e0cea282e0c..33dbefd471e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -8,7 +8,6 @@ config NOP_TRACER
config HAVE_FUNCTION_TRACER
bool
- select NOP_TRACER
config HAVE_DYNAMIC_FTRACE
bool
@@ -26,8 +25,9 @@ config TRACING
bool
select DEBUG_FS
select RING_BUFFER
- select STACKTRACE
+ select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
+ select NOP_TRACER
menu "Tracers"
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7618c528756..4a39d24568c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1339,6 +1339,14 @@ void __init ftrace_init(void)
}
#else
+
+static int __init ftrace_nodyn_init(void)
+{
+ ftrace_enabled = 1;
+ return 0;
+}
+device_initcall(ftrace_nodyn_init);
+
# define ftrace_startup() do { } while (0)
# define ftrace_shutdown() do { } while (0)
# define ftrace_startup_sysctl() do { } while (0)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cedf4e26828..3f338063864 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1022,8 +1022,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
u64 ts, delta;
int commit = 0;
+ int nr_loops = 0;
again:
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (unlikely(++nr_loops > 1000)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
ts = ring_buffer_time_stamp(cpu_buffer->cpu);
/*
@@ -1532,10 +1547,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long flags;
+ int nr_loops = 0;
spin_lock_irqsave(&cpu_buffer->lock, flags);
again:
+ /*
+ * This should normally only loop twice. But because the
+ * start of the reader inserts an empty page, it causes
+ * a case where we will loop three times. There should be no
+ * reason to loop four times (that I know of).
+ */
+ if (unlikely(++nr_loops > 3)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ reader = NULL;
+ goto out;
+ }
+
reader = cpu_buffer->reader_page;
/* If there's more to read, return this page */
@@ -1665,6 +1693,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
struct buffer_page *reader;
+ int nr_loops = 0;
if (!cpu_isset(cpu, buffer->cpumask))
return NULL;
@@ -1672,6 +1701,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
cpu_buffer = buffer->buffers[cpu];
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
return NULL;
@@ -1722,6 +1764,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
+ int nr_loops = 0;
if (ring_buffer_iter_empty(iter))
return NULL;
@@ -1730,6 +1773,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a610ca77155..9f3b478f917 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -656,7 +656,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
entry->flags =
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#else
+ TRACE_FLAG_IRQS_NOSUPPORT |
+#endif
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
@@ -701,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc)
{
+#ifdef CONFIG_STACKTRACE
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -726,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
save_stack_trace(&trace);
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
}
void __trace_stack(struct trace_array *tr,
@@ -1082,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p)
mutex_unlock(&trace_types_lock);
}
-#define KRETPROBE_MSG "[unknown/kretprobe'd]"
-
#ifdef CONFIG_KRETPROBES
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return addr == (unsigned long)kretprobe_trampoline;
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
}
#else
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return 0;
+ return name;
}
#endif /* CONFIG_KRETPROBES */
@@ -1101,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1115,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
sprint_symbol(str, address);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1244,7 +1259,8 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
trace_seq_printf(s, "%3d", cpu);
trace_seq_printf(s, "%c%c",
- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -1370,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_puts(s, " (");
- if (kretprobed(field->parent_ip))
- trace_seq_puts(s, KRETPROBE_MSG);
- else
- seq_print_ip_sym(s, field->parent_ip, sym_flags);
+ seq_print_ip_sym(s, field->parent_ip, sym_flags);
trace_seq_puts(s, ")\n");
break;
}
@@ -1489,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
ret = trace_seq_printf(s, " <-");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (kretprobed(field->parent_ip))
- ret = trace_seq_puts(s, KRETPROBE_MSG);
- else
- ret = seq_print_ip_sym(s,
- field->parent_ip,
- sym_flags);
+ ret = seq_print_ip_sym(s,
+ field->parent_ip,
+ sym_flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6889ca48f1f..8465ad05270 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -120,18 +120,20 @@ struct trace_boot {
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * NEED_RESCED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- * CONT - multiple entries hold the trace item
+ * IRQS_OFF - interrupts were disabled
+ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
+ * NEED_RESCED - reschedule is requested
+ * HARDIRQ - inside an interrupt handler
+ * SOFTIRQ - inside a softirq handler
+ * CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_NEED_RESCHED = 0x02,
- TRACE_FLAG_HARDIRQ = 0x04,
- TRACE_FLAG_SOFTIRQ = 0x08,
- TRACE_FLAG_CONT = 0x10,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9..d4dc69ddebd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
return ret;
}
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+ struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL in the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc;
+
+ INIT_WORK(&wfc.work, do_work_for_cpu);
+ wfc.fn = fn;
+ wfc.arg = arg;
+ get_online_cpus();
+ if (unlikely(!cpu_online(cpu)))
+ wfc.ret = -EINVAL;
+ else {
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ }
+ put_online_cpus();
+
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
void __init init_workqueues(void)
{
cpu_populated_map = cpu_online_map;