From 606dba2e289446600a0b68422ed2019af5355c12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched: Push put_prev_task() into pick_next_task() In order to avoid having to do put/set on a whole cgroup hierarchy when we context switch, push the put into pick_next_task() so that both operations are in the same function. Further changes then allow us to possibly optimize away redundant work. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b4..a15ca1c0c7b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + if (prev) + prev->sched_class->put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) -- cgit v1.2.3-70-g09d2 From 38033c37faab850ed5d33bb675c4de6c66be84d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jan 2014 20:32:21 +0100 Subject: sched: Push down pre_schedule() and idle_balance() This patch both merged idle_balance() and pre_schedule() and pushes both of them into pick_next_task(). Conceptually pre_schedule() and idle_balance() are rather similar, both are used to pull more work onto the current CPU. We cannot however first move idle_balance() into pre_schedule_fair() since there is no guarantee the last runnable task is a fair task, and thus we would miss newidle balances. Similarly, the dl and rt pre_schedule calls must be ran before idle_balance() since their respective tasks have higher priority and it would not do to delay their execution searching for less important tasks first. However, by noticing that pick_next_tasks() already traverses the sched_class hierarchy in the right order, we can get the right behaviour and do away with both calls. We must however change the special case optimization to also require that prev is of sched_class_fair, otherwise we can miss doing a dl or rt pull where we needed one. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-a8k6vvaebtn64nie345kx1je@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 ++------------------------ kernel/sched/deadline.c | 15 +++++++-------- kernel/sched/fair.c | 26 ++++++++++++++++++++++---- kernel/sched/idle_task.c | 12 +++++------- kernel/sched/rt.c | 16 ++++++++-------- kernel/sched/sched.h | 1 - 6 files changed, 44 insertions(+), 52 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dedb5f07666..3068f37f7c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2169,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2193,10 +2186,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2592,7 +2581,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + if (likely(prev->sched_class == &fair_sched_class && + rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); if (likely(p)) return p; @@ -2695,18 +2685,6 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) { - /* - * We must set idle_stamp _before_ calling idle_balance(), such - * that we measure the duration of idle_balance() as idle time. - */ - rq->idle_stamp = rq_clock(rq); - if (idle_balance(rq)) - rq->idle_stamp = 0; - } - if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 50797d57608..ed31ef66ab9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +static int pull_dl_task(struct rq *this_rq); + #endif /* CONFIG_SMP */ /* @@ -998,6 +1000,11 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; +#ifdef CONFIG_SMP + if (dl_task(prev)) + pull_dl_task(rq); +#endif + if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1429,13 +1436,6 @@ skip: return ret; } -static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull other tasks here */ - if (dl_task(prev)) - pull_dl_task(rq); -} - static void post_schedule_dl(struct rq *rq) { push_dl_tasks(rq); @@ -1628,7 +1628,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .pre_schedule = pre_schedule_dl, .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a81b241ff70..43b49fe077a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2577,7 +2577,8 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2589,7 +2590,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4682,9 +4683,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct sched_entity *se; struct task_struct *p; +again: __maybe_unused #ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; if (!prev || prev->sched_class != &fair_sched_class) goto simple; @@ -4760,7 +4762,7 @@ simple: #endif if (!cfs_rq->nr_running) - return NULL; + goto idle; if (prev) prev->sched_class->put_prev_task(rq, prev); @@ -4777,6 +4779,22 @@ simple: hrtick_start_fair(rq, p); return p; + +idle: +#ifdef CONFIG_SMP + idle_enter_fair(rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + rq->idle_stamp = rq_clock(rq); + if (idle_balance(rq)) { /* drops rq->lock */ + rq->idle_stamp = 0; + goto again; + } +#endif + + return NULL; } /* diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 721371bf03b..f7d03af79a5 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,13 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -56,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + idle_exit_fair(rq); + rq_last_tick_reset(rq); +#endif } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -99,7 +98,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a15ca1c0c7b..72f9ec75997 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -1330,6 +1332,12 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; +#ifdef CONFIG_SMP + /* Try to pull RT tasks here if we lower this rq's prio */ + if (rq->rt.highest_prio.curr > prev->prio) + pull_rt_task(rq); +#endif + if (!rt_rq->rt_nr_running) return NULL; @@ -1721,13 +1729,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -2004,7 +2005,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c534cf4181a..1bf34c257d3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1118,7 +1118,6 @@ struct sched_class { int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3-70-g09d2 From 3f1d2a318171bf61850d4e5a72031271e5aada76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 10:49:30 +0100 Subject: sched: Fix hotplug task migration Dan Carpenter reported: > kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338) > kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005) Kirill also spotted that migrate_tasks() will have an instant NULL deref because pick_next_task() will immediately deref prev. Instead of fixing all the corner cases because migrate_tasks() can pass in a NULL prev task in the unlikely case of hot-un-plug, provide a fake task such that we can remove all the NULL checks from the far more common paths. A further problem; not previously spotted; is that because we pushed pre_schedule() and idle_balance() into pick_next_task() we now need to avoid those getting called and pulling more tasks on our dying CPU. We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1. We also note that since we call pick_next_task() exactly the amount of times we have runnable tasks present, we should never land in idle_balance(). Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Cc: Juri Lelli Cc: Ingo Molnar Cc: Steven Rostedt Reported-by: Kirill Tkhai Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140212094930.GB3545@laptop.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 18 +++++++++++++++++- kernel/sched/deadline.c | 3 +-- kernel/sched/fair.c | 5 ++--- kernel/sched/idle_task.c | 3 +-- kernel/sched/rt.c | 3 +-- kernel/sched/sched.h | 5 +++++ kernel/sched/stop_task.c | 3 +-- 7 files changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb9764fbc53..49db434a35d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq, NULL); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ed31ef66ab9..bfeb84ecc32 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (unlikely(!dl_rq->dl_nr_running)) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40c758bbdd5..e884e45982a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4690,7 +4690,7 @@ again: if (!cfs_rq->nr_running) goto idle; - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; /* @@ -4766,8 +4766,7 @@ simple: if (!cfs_rq->nr_running) goto idle; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index f7d03af79a5..53ff9e7c76d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -26,8 +26,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev) { - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 72f9ec75997..65c2d6881ac 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (rt_rq_throttled(rt_rq)) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); p = _pick_next_task_rt(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 92018f9821e..d276147ba5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1147,6 +1147,11 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index a4147c9d201..d6ce65dde54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) if (!stop || !stop->on_rq) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); stop->se.exec_start = rq_clock_task(rq); -- cgit v1.2.3-70-g09d2 From dc87734106bb6e97c92d8bd81f261fb71976ec2c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 15:47:29 +0100 Subject: sched: Remove some #ifdeffery Remove a few gratuitous #ifdefs in pick_next_task*(). Cc: Ingo Molnar Cc: Steven Rostedt Cc: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-nnzddp5c4fijyzzxxrwlxghf@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 31 +++++++++++++++++++++++++------ kernel/sched/idle_task.c | 4 ---- kernel/sched/rt.c | 41 ++++++++++++++++++++++++++++++----------- kernel/sched/sched.h | 5 +++++ 4 files changed, 60 insertions(+), 21 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bfeb84ecc32..3185b775dbf 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -214,6 +214,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq) static int push_dl_task(struct rq *rq); +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + #else static inline @@ -236,6 +246,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1000,10 +1023,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; -#ifdef CONFIG_SMP - if (dl_task(prev)) + if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); -#endif if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1024,9 +1045,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) start_hrtick_dl(rq, p); #endif -#ifdef CONFIG_SMP - rq->post_schedule = has_pushable_dl_tasks(rq); -#endif /* CONFIG_SMP */ + set_post_schedule(rq); return p; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 53ff9e7c76d..1f372588283 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,9 +29,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP idle_enter_fair(rq); -#endif return rq->idle; } @@ -50,10 +48,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { -#ifdef CONFIG_SMP idle_exit_fair(rq); rq_last_tick_reset(rq); -#endif } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 65c2d6881ac..3e488ca6050 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -231,6 +231,12 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) static int pull_rt_task(struct rq *this_rq); +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -317,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -361,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static inline int on_rt_rq(struct sched_rt_entity *rt_se) @@ -1332,11 +1360,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; -#ifdef CONFIG_SMP - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) + if (need_pull_rt_task(rq, prev)) pull_rt_task(rq); -#endif if (!rt_rq->rt_nr_running) return NULL; @@ -1352,13 +1377,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d276147ba5e..caf4abda45e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1172,6 +1172,11 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +#else + +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } + #endif extern void sysrq_sched_debug_show(void); -- cgit v1.2.3-70-g09d2 From 11c785b79ef2a669e4bf7be5cf2c3904b8fed015 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 8 Feb 2014 14:17:45 +0800 Subject: sched/rt: Make init_sched_rt_calss() __init It's a bootstrap function. Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52F5CC09.1080502@huawei.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3e488ca6050..4d4b386598a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1849,7 +1849,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; -- cgit v1.2.3-70-g09d2 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill problem, it has a drawback. Indeed, Kirill noted again: It looks we may get into a situation, when all CPU time is shared between RT and DL tasks: rt_runtime = n rt_period = 2n | RT working, DL sleeping | DL working, RT sleeping | ----------------------------------------------------------- | (1) duration = n | (2) duration = n | (repeat) |--------------------------|------------------------------| | (rt_bw timer is running) | (rt_bw timer is not running) | No time for fair tasks at all. While this can happen during the first period, if rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1), suppose after (2) RT is enqueued back, it gets throttled since rt timer didn't fire, replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What above (the nice case), practically never happens in reality, where your rt timer is not aligned to tasks periods, tasks are in general not periodic, etc.. Long story short, you always risk to overload your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems: - no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute; - you can still eat up all bandwidth during the first period, but not anymore after that, remember that DL execution will increment rt_time till the upper limit is reached. The situation is still not perfect! But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf93030e0..6e79b3faa4c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b4..1999021042c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.2.3-70-g09d2 From 37e117c07b89194aae7062bc63bde1104c03db02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 12:25:08 +0100 Subject: sched: Guarantee task priority in pick_next_task() Michael spotted that the idle_balance() push down created a task priority problem. Previously, when we called idle_balance() before pick_next_task() it wasn't a problem when -- because of the rq->lock droppage -- an rt/dl task slipped in. Similarly for pre_schedule(), rt pre-schedule could have a dl task slip in. But by pulling it into the pick_next_task() loop, we'll not try a higher task priority again. Cure this by creating a re-start condition in pick_next_task(); and triggering this from pick_next_task_{rt,fair}(). It also fixes a live-lock where we get stuck in pick_next_task_fair() due to idle_balance() seeing !0 nr_running but there not actually being any fair tasks about. Reported-by: Michael Wang Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140224121218.GR15586@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++---- kernel/sched/fair.c | 13 ++++++++++++- kernel/sched/rt.c | 10 +++++++++- kernel/sched/sched.h | 5 +++++ 4 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a73b8897b..cde573d3f12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2586,24 +2586,28 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == &fair_sched_class && + if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p)) + if (likely(p && p != RETRY_TASK)) return p; } +again: for_each_class(class) { p = class->pick_next_task(rq, prev); - if (p) + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be4f7d9eaf0..16042b58a32 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4686,6 +4686,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; + int new_tasks; again: #ifdef CONFIG_FAIR_GROUP_SCHED @@ -4784,7 +4785,17 @@ simple: return p; idle: - if (idle_balance(rq)) /* drops rq->lock */ + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + new_tasks = idle_balance(rq); + + if (rq->nr_running != rq->cfs.h_nr_running) + return RETRY_TASK; + + if (new_tasks) goto again; return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4d4b386598a..398b3f99082 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1360,8 +1360,16 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) + if (need_pull_rt_task(rq, prev)) { pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl task can slip in, in which case we need to + * re-start task selection. + */ + if (unlikely(rq->dl.dl_nr_running)) + return RETRY_TASK; + } if (!rt_rq->rt_nr_running) return NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 046084ebb1f..1929deb3f29 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1091,6 +1091,8 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1105,6 +1107,9 @@ struct sched_class { * It is the responsibility of the pick_next_task() method that will * return the next task to call put_prev_task() on the @prev task or * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev); -- cgit v1.2.3-70-g09d2 From 734ff2a71f9e6aa6fedfa5a9a34818b8586516d5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 4 Mar 2014 19:25:46 +0400 Subject: sched/rt: Fix picking RT and DL tasks from empty queue The problems: 1) We check for rt_nr_running before call of put_prev_task(). If previous task is RT, its rt_rq may become throttled and dequeued after this call. In case of p is from rt->rq this just causes picking a task from throttled queue, but in case of its rt_rq is child we are guaranteed catch BUG_ON. 2) The same with deadline class. The only difference we operate on only dl_rq. This patch fixes all the above problems and it adds a small skip in the DL update like we've already done for RT class: if (unlikely((s64)delta_exec <= 0)) return; This will optimize sequential update_curr_dl() calls a little. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393946746.3643.3.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 ++++++++-- kernel/sched/rt.c | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e4f3ac3b851..27ef4092552 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -609,8 +609,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1023,6 +1023,12 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); if (unlikely(!dl_rq->dl_nr_running)) return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index facc824334f..f3cee0a63b7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1379,6 +1379,13 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) return RETRY_TASK; } + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + if (!rt_rq->rt_nr_running) return NULL; -- cgit v1.2.3-70-g09d2 From 4c6c4e38c4e9a454889298dcc498174968d14a09 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 13:32:01 +0400 Subject: sched/core: Fix endless loop in pick_next_task() 1) Single cpu machine case. When rq has only RT tasks, but no one of them can be picked because of throttling, we enter in endless loop. pick_next_task_{dl,rt} return NULL. In pick_next_task_fair() we permanently go to retry if (rq->nr_running != rq->cfs.h_nr_running) return RETRY_TASK; (rq->nr_running is not being decremented when rt_rq becomes throttled). No chances to unthrottle any rt_rq or to wake fair here, because of rq is locked permanently and interrupts are disabled. 2) In case of SMP this can cause a hang too. Although we unlock rq in idle_balance(), interrupts are still disabled. The solution is to check for available tasks in DL and RT classes instead of checking for sum. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394098321.19290.11.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- kernel/sched/rt.c | 10 ---------- kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel/sched/rt.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b956e70fc50..10db4a87ad7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6728,7 +6728,9 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + (this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f3cee0a63b7..d8cdf161855 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -470,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -545,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 378bff76267..f2de7a17562 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -423,6 +423,18 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} +#endif + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.2.3-70-g09d2