summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c251
1 files changed, 143 insertions, 108 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 84b9e076812..ec5f472bc5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched_cpupri.h"
#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
return 1;
return 0;
}
@@ -422,6 +425,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ atomic_t rto_count;
struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
* one runnable RT task.
*/
cpumask_var_t rto_mask;
- atomic_t rto_count;
struct cpupri cpupri;
};
@@ -528,6 +531,12 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
/* calc_load related fields */
unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return rq->avg_load_per_task;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->load.weight;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->se[cpu]->load.weight;
- load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
-{
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
#ifdef CONFIG_PREEMPT
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
}
EXPORT_SYMBOL_GPL(account_system_vtime);
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
{
- s64 irq_delta;
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
@@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_branch((&paravirt_steal_rq_enabled))) {
+ u64 st;
+
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ st = steal_ticks(steal);
+ steal = st * TICK_NSEC;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
rq->clock_task += delta;
- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void)
#define sched_clock_irqtime (0)
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
- rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
- if (unlikely(rq->idle_stamp)) {
+ if (rq->idle_stamp) {
u64 delta = rq->clock - rq->idle_stamp;
u64 max = 2*sysctl_sched_migration_cost;
@@ -2886,7 +2898,7 @@ void sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
@@ -3053,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_disable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
- perf_event_task_sched_in(current);
+ perf_event_task_sched_in(prev, current);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3877,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
cpustat->idle = cputime64_add(cpustat->idle, cputime64);
}
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_branch(&paravirt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ account_steal_time(st);
+ return st;
+ }
+#endif
+ return false;
+}
+
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3908,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ if (steal_account_process_tick())
+ return;
+
if (irqtime_account_hi_update()) {
cpustat->irq = cputime64_add(cpustat->irq, tmp);
} else if (irqtime_account_si_update()) {
@@ -3961,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
+ if (steal_account_process_tick())
+ return;
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4242,9 +4279,9 @@ pick_next_task(struct rq *rq)
}
/*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
*/
-asmlinkage void __sched schedule(void)
+static void __sched __schedule(void)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -4285,16 +4322,6 @@ need_resched:
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
-
- /*
- * If we are going to sleep and we have plugged IO
- * queued, make sure to submit it to avoid deadlocks.
- */
- if (blk_needs_flush_plug(prev)) {
- raw_spin_unlock(&rq->lock);
- blk_schedule_flush_plug(prev);
- raw_spin_lock(&rq->lock);
- }
}
switch_count = &prev->nvcsw;
}
@@ -4332,17 +4359,34 @@ need_resched:
if (need_resched())
goto need_resched;
}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state)
+ return;
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage void schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ __schedule();
+}
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
- bool ret = false;
-
- rcu_read_lock();
if (lock->owner != owner)
- goto fail;
+ return false;
/*
* Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4396,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
*/
barrier();
- ret = owner->on_cpu;
-fail:
- rcu_read_unlock();
-
- return ret;
+ return owner->on_cpu;
}
/*
@@ -4368,21 +4408,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
if (!sched_feat(OWNER_SPIN))
return 0;
+ rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
- return 0;
+ break;
arch_mutex_cpu_relax();
}
+ rcu_read_unlock();
/*
- * If the owner changed to another task there is likely
- * heavy contention, stop spinning.
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
*/
- if (lock->owner)
- return 0;
-
- return 1;
+ return lock->owner == NULL;
}
#endif
@@ -4405,7 +4445,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
do {
add_preempt_count_notrace(PREEMPT_ACTIVE);
- schedule();
+ __schedule();
sub_preempt_count_notrace(PREEMPT_ACTIVE);
/*
@@ -4433,7 +4473,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
- schedule();
+ __schedule();
local_irq_disable();
sub_preempt_count(PREEMPT_ACTIVE);
@@ -5558,7 +5598,7 @@ static inline int should_resched(void)
static void __cond_resched(void)
{
add_preempt_count(PREEMPT_ACTIVE);
- schedule();
+ __schedule();
sub_preempt_count(PREEMPT_ACTIVE);
}
@@ -7413,6 +7453,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
if (sd && (sd->flags & SD_OVERLAP))
free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
kfree(*per_cpu_ptr(sdd->sg, j));
kfree(*per_cpu_ptr(sdd->sgp, j));
}
@@ -7898,17 +7939,10 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->rq = rq;
- /* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
- cfs_rq->load_stamp = 1;
-#endif
-#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7928,13 +7962,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
@@ -7944,11 +7974,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
- rt_rq->rt_nr_boosted = 0;
- rt_rq->rq = rq;
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7957,11 +7982,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
- tg->cfs_rq[cpu] = cfs_rq;
- init_cfs_rq(cfs_rq, rq);
+
cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
+#endif
+ tg->cfs_rq[cpu] = cfs_rq;
tg->se[cpu] = se;
+
/* se could be NULL for root_task_group */
if (!se)
return;
@@ -7984,12 +8015,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- tg->rt_rq[cpu] = rt_rq;
- init_rt_rq(rt_rq, rq);
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
rt_rq->tg = tg;
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ tg->rt_rq[cpu] = rt_rq;
tg->rt_se[cpu] = rt_se;
+
if (!rt_se)
return;
@@ -8071,7 +8104,7 @@ void __init sched_init(void)
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs, rq);
+ init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = root_task_group_load;
@@ -8185,7 +8218,7 @@ void __init sched_init(void)
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8195,7 +8228,6 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
-#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8217,7 +8249,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
if (irqs_disabled())
print_irqtrace_events(current);
dump_stack();
-#endif
}
EXPORT_SYMBOL(__might_sleep);
#endif
@@ -8376,6 +8407,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
+ init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
@@ -8403,7 +8435,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
{
}
@@ -8424,7 +8456,8 @@ static void free_rt_sched_group(struct task_group *tg)
{
int i;
- destroy_rt_bandwidth(&tg->rt_bandwidth);
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
for_each_possible_cpu(i) {
if (tg->rt_rq)
@@ -8465,6 +8498,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
+ init_rt_rq(rt_rq, cpu_rq(i));
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}