summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c847
1 files changed, 558 insertions, 289 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index cb31fb4a137..6c10fa796ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -53,6 +53,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
+#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
@@ -60,6 +61,7 @@
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
+#include <linux/pagemap.h>
#include <asm/tlb.h>
@@ -261,9 +263,9 @@ struct rq {
s64 clock_max_delta;
unsigned int clock_warps, clock_overflows;
- unsigned int clock_unstable_events;
-
- struct sched_class *load_balance_class;
+ u64 idle_clock;
+ unsigned int clock_deep_idle_events;
+ u64 tick_timestamp;
atomic_t nr_iowait;
@@ -301,7 +303,7 @@ struct rq {
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -319,15 +321,19 @@ static inline int cpu_of(struct rq *rq)
}
/*
- * Per-runqueue clock, as finegrained as the platform can give us:
+ * Update the per-runqueue clock, as finegrained as the platform can give
+ * us, but without assuming monotonicity, etc.:
*/
-static unsigned long long __rq_clock(struct rq *rq)
+static void __update_rq_clock(struct rq *rq)
{
u64 prev_raw = rq->prev_clock_raw;
u64 now = sched_clock();
s64 delta = now - prev_raw;
u64 clock = rq->clock;
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+#endif
/*
* Protect against sched_clock() occasionally going backwards:
*/
@@ -338,8 +344,11 @@ static unsigned long long __rq_clock(struct rq *rq)
/*
* Catch too large forward jumps too:
*/
- if (unlikely(delta > 2*TICK_NSEC)) {
- clock++;
+ if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
+ if (clock < rq->tick_timestamp + TICK_NSEC)
+ clock = rq->tick_timestamp + TICK_NSEC;
+ else
+ clock++;
rq->clock_overflows++;
} else {
if (unlikely(delta > rq->clock_max_delta))
@@ -350,18 +359,12 @@ static unsigned long long __rq_clock(struct rq *rq)
rq->prev_clock_raw = now;
rq->clock = clock;
-
- return clock;
}
-static inline unsigned long long rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- int this_cpu = smp_processor_id();
-
- if (this_cpu == cpu_of(rq))
- return __rq_clock(rq);
-
- return rq->clock;
+ if (likely(smp_processor_id() == cpu_of(rq)))
+ __update_rq_clock(rq);
}
/*
@@ -379,6 +382,25 @@ static inline unsigned long long rq_clock(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+ unsigned long long now;
+ unsigned long flags;
+ struct rq *rq;
+
+ local_irq_save(flags);
+ rq = cpu_rq(cpu);
+ update_rq_clock(rq);
+ now = rq->clock;
+ local_irq_restore(flags);
+
+ return now;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
@@ -536,18 +558,40 @@ static inline struct rq *this_rq_lock(void)
}
/*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
*/
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
{
- unsigned long flags;
- struct rq *rq;
+ struct rq *rq = cpu_rq(smp_processor_id());
- rq = task_rq_lock(current, &flags);
- rq->prev_clock_raw = sched_clock();
- rq->clock_unstable_events++;
- task_rq_unlock(rq, &flags);
+ spin_lock(&rq->lock);
+ __update_rq_clock(rq);
+ spin_unlock(&rq->lock);
+ rq->clock_deep_idle_events++;
}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+ struct rq *rq = cpu_rq(smp_processor_id());
+ u64 now = sched_clock();
+
+ rq->idle_clock += delta_ns;
+ /*
+ * Override the previous timestamp and ignore all
+ * sched_clock() deltas that occured while we idled,
+ * and use the PM-provided delta_ns to advance the
+ * rq clock:
+ */
+ spin_lock(&rq->lock);
+ rq->prev_clock_raw = now;
+ rq->clock += delta_ns;
+ spin_unlock(&rq->lock);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
/*
* resched_task - mark a task 'to be rescheduled now'.
@@ -622,27 +666,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
#define WMULT_SHIFT 32
-static inline unsigned long
+/*
+ * Shift right and round:
+ */
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
if (unlikely(!lw->inv_weight))
- lw->inv_weight = WMULT_CONST / lw->weight;
+ lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
- if (unlikely(tmp > WMULT_CONST)) {
- tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
- >> (WMULT_SHIFT/2);
- } else {
- tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
- }
+ if (unlikely(tmp > WMULT_CONST))
+ tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+ WMULT_SHIFT/2);
+ else
+ tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
- return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
static inline unsigned long
@@ -663,46 +711,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
- if (rq->curr != rq->idle && ls->load.weight) {
- ls->delta_exec += ls->delta_stat;
- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
- ls->delta_stat = 0;
- }
-}
-
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->ls.load
- * and when switching tasks.
- */
-static void update_curr_load(struct rq *rq, u64 now)
-{
- struct load_stat *ls = &rq->ls;
- u64 start;
-
- start = ls->load_update_start;
- ls->load_update_start = now;
- ls->delta_stat += now - start;
- /*
- * Stagger updates to ls->delta_fair. Very frequent updates
- * can be expensive.
- */
- if (ls->delta_stat >= sysctl_sched_stat_granularity)
- __update_curr_load(rq, ls);
-}
-
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -712,19 +720,6 @@ static void update_curr_load(struct rq *rq, u64 now)
* slice expiry etc.
*/
-/*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define load_weight(lp) \
- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
- load_weight(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
-
#define WEIGHT_IDLEPRIO 2
#define WMULT_IDLEPRIO (1 << 31)
@@ -741,11 +736,14 @@ static void update_curr_load(struct rq *rq, u64 now)
* the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
-/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
-/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
-/* 0 */ NICE_0_LOAD /* 1024 */,
-/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
-/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
};
/*
@@ -756,42 +754,16 @@ static const int prio_to_weight[40] = {
* into multiplications:
*/
static const u32 prio_to_wmult[40] = {
-/* -20 */ 48356, 60446, 75558, 94446, 118058,
-/* -15 */ 147573, 184467, 230589, 288233, 360285,
-/* -10 */ 450347, 562979, 703746, 879575, 1099582,
-/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
-/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
-/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
-/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
-/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-static inline void
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_add(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_sub(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running++;
- inc_load(rq, p, now);
-}
-
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running--;
- dec_load(rq, p, now);
-}
-
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
/*
@@ -809,8 +781,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, unsigned long *load_moved,
- int this_best_prio, int best_prio, int best_prio_seen,
- struct rq_iterator *iterator);
+ int *this_best_prio, struct rq_iterator *iterator);
#include "sched_stats.h"
#include "sched_rt.c"
@@ -822,9 +793,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
#define sched_class_highest (&rt_sched_class)
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+ if (rq->curr != rq->idle && ls->load.weight) {
+ ls->delta_exec += ls->delta_stat;
+ ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+ ls->delta_stat = 0;
+ }
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq)
+{
+ struct load_stat *ls = &rq->ls;
+ u64 start;
+
+ start = ls->load_update_start;
+ ls->load_update_start = rq->clock;
+ ls->delta_stat += rq->clock - start;
+ /*
+ * Stagger updates to ls->delta_fair. Very frequent updates
+ * can be expensive.
+ */
+ if (ls->delta_stat >= sysctl_sched_stat_granularity)
+ __update_curr_load(rq, ls);
+}
+
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_curr_load(rq);
+ update_load_add(&rq->ls.load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_curr_load(rq);
+ update_load_sub(&rq->ls.load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
+{
+ rq->nr_running++;
+ inc_load(rq, p);
+}
+
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
+{
+ rq->nr_running--;
+ dec_load(rq, p);
+}
+
static void set_load_weight(struct task_struct *p)
{
- task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
p->se.wait_runtime = 0;
if (task_has_rt_policy(p)) {
@@ -846,18 +880,16 @@ static void set_load_weight(struct task_struct *p)
p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
{
sched_info_queued(p);
- p->sched_class->enqueue_task(rq, p, wakeup, now);
+ p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
}
-static void
-dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- p->sched_class->dequeue_task(rq, p, sleep, now);
+ p->sched_class->dequeue_task(rq, p, sleep);
p->se.on_rq = 0;
}
@@ -912,13 +944,11 @@ static int effective_prio(struct task_struct *p)
*/
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
- u64 now = rq_clock(rq);
-
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- enqueue_task(rq, p, wakeup, now);
- inc_nr_running(p, rq, now);
+ enqueue_task(rq, p, wakeup);
+ inc_nr_running(p, rq);
}
/*
@@ -926,13 +956,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
*/
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
- u64 now = rq_clock(rq);
+ update_rq_clock(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- enqueue_task(rq, p, 0, now);
- inc_nr_running(p, rq, now);
+ enqueue_task(rq, p, 0);
+ inc_nr_running(p, rq);
}
/*
@@ -940,13 +970,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
*/
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
- u64 now = rq_clock(rq);
-
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
- dequeue_task(rq, p, sleep, now);
- dec_nr_running(p, rq, now);
+ dequeue_task(rq, p, sleep);
+ dec_nr_running(p, rq);
}
/**
@@ -981,18 +1009,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
u64 clock_offset, fair_clock_offset;
clock_offset = old_rq->clock - new_rq->clock;
- fair_clock_offset = old_rq->cfs.fair_clock -
- new_rq->cfs.fair_clock;
- if (p->se.wait_start)
- p->se.wait_start -= clock_offset;
+ fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
+
if (p->se.wait_start_fair)
p->se.wait_start_fair -= fair_clock_offset;
+ if (p->se.sleep_start_fair)
+ p->se.sleep_start_fair -= fair_clock_offset;
+
+#ifdef CONFIG_SCHEDSTATS
+ if (p->se.wait_start)
+ p->se.wait_start -= clock_offset;
if (p->se.sleep_start)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
- if (p->se.sleep_start_fair)
- p->se.sleep_start_fair -= fair_clock_offset;
+#endif
__set_task_cpu(p, new_cpu);
}
@@ -1511,6 +1542,7 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
+ update_rq_clock(rq);
activate_task(rq, p, 1);
/*
* Sync wakeups (i.e. those types of wakeups where the waker
@@ -1553,17 +1585,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
static void __sched_fork(struct task_struct *p)
{
p->se.wait_start_fair = 0;
- p->se.wait_start = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
p->se.delta_exec = 0;
p->se.delta_fair_run = 0;
p->se.delta_fair_sleep = 0;
p->se.wait_runtime = 0;
+ p->se.sleep_start_fair = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
p->se.sum_wait_runtime = 0;
p->se.sum_sleep_runtime = 0;
p->se.sleep_start = 0;
- p->se.sleep_start_fair = 0;
p->se.block_start = 0;
p->se.sleep_max = 0;
p->se.block_max = 0;
@@ -1571,10 +1606,15 @@ static void __sched_fork(struct task_struct *p)
p->se.wait_max = 0;
p->se.wait_runtime_overruns = 0;
p->se.wait_runtime_underruns = 0;
+#endif
INIT_LIST_HEAD(&p->run_list);
p->se.on_rq = 0;
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
/*
* We mark the process as running here, but have not actually
* inserted it onto the runqueue yet. This guarantees that
@@ -1639,11 +1679,19 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id(); /* parent's CPU */
+ update_rq_clock(rq);
p->prio = effective_prio(p);
- if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
- task_cpu(p) != this_cpu || !current->se.on_rq) {
+ if (rt_prio(p->prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
+ (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
+ !current->se.on_rq) {
+
activate_task(rq, p, 0);
} else {
/*
@@ -1651,14 +1699,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
+ inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
task_rq_unlock(rq, &flags);
}
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
* @next: the task we are going to switch to.
*
* This is called with the rq lock held and interrupts off. It must
@@ -1668,8 +1776,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
+ fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
}
@@ -1711,6 +1822,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
+ fire_sched_in_preempt_notifiers(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
@@ -1751,7 +1863,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
{
struct mm_struct *mm, *oldmm;
- prepare_task_switch(rq, next);
+ prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -1874,7 +1986,6 @@ static void update_cpu_load(struct rq *this_rq)
unsigned long total_load = this_rq->ls.load.weight;
unsigned long this_load = total_load;
struct load_stat *ls = &this_rq->ls;
- u64 now = __rq_clock(this_rq);
int i, scale;
this_rq->nr_load_updates++;
@@ -1882,7 +1993,7 @@ static void update_cpu_load(struct rq *this_rq)
goto do_avg;
/* Update delta_fair/delta_exec fields first */
- update_curr_load(this_rq, now);
+ update_curr_load(this_rq);
fair_delta64 = ls->delta_fair + 1;
ls->delta_fair = 0;
@@ -1890,8 +2001,8 @@ static void update_cpu_load(struct rq *this_rq)
exec_delta64 = ls->delta_exec + 1;
ls->delta_exec = 0;
- sample_interval64 = now - ls->load_update_last;
- ls->load_update_last = now;
+ sample_interval64 = this_rq->clock - ls->load_update_last;
+ ls->load_update_last = this_rq->clock;
if ((s64)sample_interval64 < (s64)TICK_NSEC)
sample_interval64 = TICK_NSEC;
@@ -1946,6 +2057,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
spin_lock(&rq1->lock);
}
}
+ update_rq_clock(rq1);
+ update_rq_clock(rq2);
}
/*
@@ -2073,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
if (task_running(rq, p))
return 0;
- /*
- * Aggressive migration if too many balance attempts have failed:
- */
- if (sd->nr_balance_failed > sd->cache_nice_tries)
- return 1;
-
return 1;
}
@@ -2086,8 +2193,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, unsigned long *load_moved,
- int this_best_prio, int best_prio, int best_prio_seen,
- struct rq_iterator *iterator)
+ int *this_best_prio, struct rq_iterator *iterator)
{
int pulled = 0, pinned = 0, skip_for_load;
struct task_struct *p;
@@ -2112,12 +2218,8 @@ next:
*/
skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
SCHED_LOAD_SCALE_FUZZ;
- if (skip_for_load && p->prio < this_best_prio)
- skip_for_load = !best_prio_seen && p->prio == best_prio;
- if (skip_for_load ||
+ if ((skip_for_load && p->prio >= *this_best_prio) ||
!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-
- best_prio_seen |= p->prio == best_prio;
p = iterator->next(iterator->arg);
goto next;
}
@@ -2131,8 +2233,8 @@ next:
* and the prescribed amount of weighted load.
*/
if (pulled < max_nr_move && rem_load_move > 0) {
- if (p->prio < this_best_prio)
- this_best_prio = p->prio;
+ if (p->prio < *this_best_prio)
+ *this_best_prio = p->prio;
p = iterator->next(iterator->arg);
goto next;
}
@@ -2151,32 +2253,52 @@ out:
}
/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
*
* Called with both runqueues locked.
*/
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_nr_move, unsigned long max_load_move,
+ unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
struct sched_class *class = sched_class_highest;
- unsigned long load_moved, total_nr_moved = 0, nr_moved;
- long rem_load_move = max_load_move;
+ unsigned long total_load_moved = 0;
+ int this_best_prio = this_rq->curr->prio;
do {
- nr_moved = class->load_balance(this_rq, this_cpu, busiest,
- max_nr_move, (unsigned long)rem_load_move,
- sd, idle, all_pinned, &load_moved);
- total_nr_moved += nr_moved;
- max_nr_move -= nr_moved;
- rem_load_move -= load_moved;
+ total_load_moved +=
+ class->load_balance(this_rq, this_cpu, busiest,
+ ULONG_MAX, max_load_move - total_load_moved,
+ sd, idle, all_pinned, &this_best_prio);
class = class->next;
- } while (class && max_nr_move && rem_load_move > 0);
+ } while (class && max_load_move > total_load_moved);
+
+ return total_load_moved > 0;
+}
+
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ struct sched_domain *sd, enum cpu_idle_type idle)
+{
+ struct sched_class *class;
+ int this_best_prio = MAX_PRIO;
+
+ for (class = sched_class_highest; class; class = class->next)
+ if (class->load_balance(this_rq, this_cpu, busiest,
+ 1, ULONG_MAX, sd, idle, NULL,
+ &this_best_prio))
+ return 1;
- return total_nr_moved;
+ return 0;
}
/*
@@ -2235,7 +2357,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
rq = cpu_rq(i);
- if (*sd_idle && !idle_cpu(i))
+ if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
@@ -2257,9 +2379,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
- * domains.
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
*/
- if (local_group && balance_cpu != this_cpu && balance) {
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
@@ -2393,7 +2517,7 @@ group_next:
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+ if (*imbalance < busiest_load_per_task) {
unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
@@ -2445,10 +2569,8 @@ small_imbalance:
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
- if (pwr_move <= pwr_now)
- goto out_balanced;
-
- *imbalance = busiest_load_per_task;
+ if (pwr_move > pwr_now)
+ *imbalance = busiest_load_per_task;
}
return busiest;
@@ -2506,11 +2628,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
*/
#define MAX_PINNED_INTERVAL 512
-static inline unsigned long minus_1_or_zero(unsigned long n)
-{
- return n > 0 ? n - 1 : 0;
-}
-
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -2519,7 +2636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -2560,18 +2677,17 @@ redo:
schedstat_add(sd, lb_imbalance[idle], imbalance);
- nr_moved = 0;
+ ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
- * still unbalanced. nr_moved simply stays zero, so it is
+ * still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -2579,7 +2695,7 @@ redo:
/*
* some other cpu did the load balance for us.
*/
- if (nr_moved && this_cpu != smp_processor_id())
+ if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
/* All tasks on this runqueue were pinned by CPU affinity */
@@ -2591,7 +2707,7 @@ redo:
}
}
- if (!nr_moved) {
+ if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
@@ -2640,10 +2756,10 @@ redo:
sd->balance_interval *= 2;
}
- if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
- return nr_moved;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -2675,8 +2791,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
struct sched_group *group;
struct rq *busiest = NULL;
unsigned long imbalance;
- int nr_moved = 0;
+ int ld_moved = 0;
int sd_idle = 0;
+ int all_pinned = 0;
cpumask_t cpus = CPU_MASK_ALL;
/*
@@ -2709,23 +2826,25 @@ redo:
schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
- nr_moved = 0;
+ ld_moved = 0;
if (busiest->nr_running > 1) {
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
- imbalance, sd, CPU_NEWLY_IDLE, NULL);
+ /* this_rq->clock is already updated */
+ update_rq_clock(busiest);
+ ld_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, CPU_NEWLY_IDLE,
+ &all_pinned);
spin_unlock(&busiest->lock);
- if (!nr_moved) {
+ if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
}
}
- if (!nr_moved) {
+ if (!ld_moved) {
schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
@@ -2733,7 +2852,7 @@ redo:
} else
sd->nr_balance_failed = 0;
- return nr_moved;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
@@ -2810,6 +2929,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
+ update_rq_clock(busiest_rq);
+ update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
@@ -2821,9 +2942,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
if (likely(sd)) {
schedstat_inc(sd, alb_cnt);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
- NULL))
+ if (move_one_task(target_rq, target_cpu, busiest_rq,
+ sd, CPU_IDLE))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -2921,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2957,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (sd->flags & SD_SERIALIZE)
spin_unlock(&balancing);
out:
- if (time_after(next_balance, sd->last_balance + interval))
+ if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
/*
* Stop the load balance at this level. There is another
@@ -2968,7 +3091,14 @@ out:
if (!balance)
break;
}
- rq->next_balance = next_balance;
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ rq->next_balance = next_balance;
}
/*
@@ -3007,7 +3137,7 @@ static void run_rebalance_domains(struct softirq_action *h)
if (need_resched())
break;
- rebalance_domains(balance_cpu, SCHED_IDLE);
+ rebalance_domains(balance_cpu, CPU_IDLE);
rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
@@ -3092,8 +3222,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, unsigned long *load_moved,
- int this_best_prio, int best_prio, int best_prio_seen,
- struct rq_iterator *iterator)
+ int *this_best_prio, struct rq_iterator *iterator)
{
*load_moved = 0;
@@ -3119,7 +3248,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime;
if (rq->curr == p) {
- delta_exec = rq_clock(rq) - p->se.exec_start;
+ update_rq_clock(rq);
+ delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
ns += delta_exec;
}
@@ -3213,11 +3343,19 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ u64 next_tick = rq->tick_timestamp + TICK_NSEC;
spin_lock(&rq->lock);
+ __update_rq_clock(rq);
+ /*
+ * Let rq->clock advance by at least TICK_NSEC:
+ */
+ if (unlikely(rq->clock < next_tick))
+ rq->clock = next_tick;
+ rq->tick_timestamp = rq->clock;
+ update_cpu_load(rq);
if (curr != rq->idle) /* FIXME: needed? */
curr->sched_class->task_tick(rq, curr);
- update_cpu_load(rq);
spin_unlock(&rq->lock);
#ifdef CONFIG_SMP
@@ -3299,7 +3437,7 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
struct sched_class *class;
struct task_struct *p;
@@ -3309,14 +3447,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.nr_running)) {
- p = fair_sched_class.pick_next_task(rq, now);
+ p = fair_sched_class.pick_next_task(rq);
if (likely(p))
return p;
}
class = sched_class_highest;
for ( ; ; ) {
- p = class->pick_next_task(rq, now);
+ p = class->pick_next_task(rq);
if (p)
return p;
/*
@@ -3335,7 +3473,6 @@ asmlinkage void __sched schedule(void)
struct task_struct *prev, *next;
long *switch_count;
struct rq *rq;
- u64 now;
int cpu;
need_resched:
@@ -3353,6 +3490,7 @@ need_resched_nonpreemptible:
spin_lock_irq(&rq->lock);
clear_tsk_need_resched(prev);
+ __update_rq_clock(rq);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3367,9 +3505,8 @@ need_resched_nonpreemptible:
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- now = __rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev, now);
- next = pick_next_task(rq, prev, now);
+ prev->sched_class->put_prev_task(rq, prev);
+ next = pick_next_task(rq, prev);
sched_info_switch(prev, next);
@@ -3812,17 +3949,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
unsigned long flags;
int oldprio, on_rq;
struct rq *rq;
- u64 now;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- now = rq_clock(rq);
+ update_rq_clock(rq);
oldprio = p->prio;
on_rq = p->se.on_rq;
if (on_rq)
- dequeue_task(rq, p, 0, now);
+ dequeue_task(rq, p, 0);
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -3832,7 +3968,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
if (on_rq) {
- enqueue_task(rq, p, 0, now);
+ enqueue_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
@@ -3855,7 +3991,6 @@ void set_user_nice(struct task_struct *p, long nice)
int old_prio, delta, on_rq;
unsigned long flags;
struct rq *rq;
- u64 now;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -3864,7 +3999,7 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
- now = rq_clock(rq);
+ update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3877,8 +4012,8 @@ void set_user_nice(struct task_struct *p, long nice)
}
on_rq = p->se.on_rq;
if (on_rq) {
- dequeue_task(rq, p, 0, now);
- dec_load(rq, p, now);
+ dequeue_task(rq, p, 0);
+ dec_load(rq, p);
}
p->static_prio = NICE_TO_PRIO(nice);
@@ -3888,8 +4023,8 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (on_rq) {
- enqueue_task(rq, p, 0, now);
- inc_load(rq, p, now);
+ enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4125,6 +4260,7 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
+ update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
@@ -4380,10 +4516,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
out_unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&sched_hotcpu_mutex);
- if (retval)
- return retval;
- return 0;
+ return retval;
}
/**
@@ -4422,10 +4556,7 @@ asmlinkage long sys_sched_yield(void)
struct rq *rq = this_rq_lock();
schedstat_inc(rq, yld_cnt);
- if (unlikely(rq->nr_running == 1))
- schedstat_inc(rq, yld_act_empty);
- else
- current->sched_class->yield_task(rq, current);
+ current->sched_class->yield_task(rq, current);
/*
* Since we are going to call schedule() anyway, there's
@@ -4781,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
static inline void sched_init_granularity(void)
{
unsigned int factor = 1 + ilog2(num_online_cpus());
- const unsigned long gran_limit = 100000000;
+ const unsigned long limit = 100000000;
+
+ sysctl_sched_min_granularity *= factor;
+ if (sysctl_sched_min_granularity > limit)
+ sysctl_sched_min_granularity = limit;
- sysctl_sched_granularity *= factor;
- if (sysctl_sched_granularity > gran_limit)
- sysctl_sched_granularity = gran_limit;
+ sysctl_sched_latency *= factor;
+ if (sysctl_sched_latency > limit)
+ sysctl_sched_latency = limit;
- sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
- sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+ sysctl_sched_runtime_limit = sysctl_sched_latency;
+ sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
}
#ifdef CONFIG_SMP
@@ -4883,6 +5018,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq_src, p, 0);
+
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
@@ -5115,14 +5251,137 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
for ( ; ; ) {
if (!rq->nr_running)
break;
- next = pick_next_task(rq, rq->curr, rq_clock(rq));
+ update_rq_clock(rq);
+ next = pick_next_task(rq, rq->curr);
if (!next)
break;
migrate_dead(dead_cpu, next);
+
}
}
#endif /* CONFIG_HOTPLUG_CPU */
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {0,},
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {0,},
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
+
+ BUG_ON(!entry);
+ memset(entry, 0, n * sizeof(struct ctl_table));
+
+ return entry;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ mode_t mode, proc_handler *proc_handler)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[10], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[12], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax);
+
+ return table;
+}
+
+static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void init_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_online_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ sd_ctl_dir[0].child = entry;
+
+ for (i = 0; i < cpu_num; i++, entry++) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ }
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+#else
+static void init_sched_domain_sysctl(void)
+{
+}
+#endif
+
/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
@@ -5179,6 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
rq = task_rq_lock(rq->idle, &flags);
+ update_rq_clock(rq);
deactivate_task(rq, rq->idle, 0);
rq->idle->static_prio = MAX_PRIO;
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
@@ -6101,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static int arch_reinit_sched_domains(void)
{
int err;
@@ -6130,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
return ret ? ret : count;
}
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
-{
- int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
- if (smt_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_smt_power_savings.attr);
-#endif
-#ifdef CONFIG_SCHED_MC
- if (!err && mc_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_mc_power_savings.attr);
-#endif
- return err;
-}
-#endif
-
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
@@ -6158,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
{
return sched_power_savings_store(buf, count, 0);
}
-SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
@@ -6172,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
{
return sched_power_savings_store(buf, count, 1);
}
-SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
- sched_smt_power_savings_store);
+static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
#endif
/*
@@ -6228,6 +6488,8 @@ void __init sched_init_smp(void)
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
+ init_sched_domain_sysctl();
+
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
@@ -6314,6 +6576,10 @@ void __init sched_init(void)
set_load_weight(&init_task);
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
#ifdef CONFIG_SMP
nr_cpu_ids = highest_cpu + 1;
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
@@ -6379,12 +6645,14 @@ void normalize_rt_tasks(void)
do_each_thread(g, p) {
p->se.fair_key = 0;
p->se.wait_runtime = 0;
+ p->se.exec_start = 0;
p->se.wait_start_fair = 0;
+ p->se.sleep_start_fair = 0;
+#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
- p->se.exec_start = 0;
p->se.sleep_start = 0;
- p->se.sleep_start_fair = 0;
p->se.block_start = 0;
+#endif
task_rq(p)->cfs.fair_clock = 0;
task_rq(p)->clock = 0;
@@ -6408,12 +6676,13 @@ void normalize_rt_tasks(void)
goto out_unlock;
#endif
+ update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
- deactivate_task(task_rq(p), p, 0);
+ deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(task_rq(p), p, 0);
+ activate_task(rq, p, 0);
resched_task(rq->curr);
}
#ifdef CONFIG_SMP