diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 2 | ||||
-rw-r--r-- | kernel/sched/auto_group.c | 2 | ||||
-rw-r--r-- | kernel/sched/clock.c | 3 | ||||
-rw-r--r-- | kernel/sched/core.c | 364 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 6 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 37 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.h | 6 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 10 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 2 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 52 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 80 | ||||
-rw-r--r-- | kernel/sched/debug.c | 10 | ||||
-rw-r--r-- | kernel/sched/fair.c | 627 | ||||
-rw-r--r-- | kernel/sched/idle.c | 265 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 25 | ||||
-rw-r--r-- | kernel/sched/rt.c | 103 | ||||
-rw-r--r-- | kernel/sched/sched.h | 86 | ||||
-rw-r--r-- | kernel/sched/stats.c | 2 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 15 |
19 files changed, 1211 insertions, 486 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a95c8c2af2..ab32b7b0db5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o +obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58..e73efba9830 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; err = security_task_setnice(current, nice); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b30a2924ef1..3ef6451e972 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include <linux/sched.h> #include <linux/static_key.h> #include <linux/workqueue.h> +#include <linux/compiler.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5c6635b806..084d17f8913 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <linux/init_task.h> #include <linux/binfmts.h> #include <linux/context_tracking.h> +#include <linux/compiler.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -432,7 +433,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } @@ -555,12 +556,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { @@ -823,19 +827,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif @@ -1745,8 +1743,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; @@ -2149,8 +2149,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2167,13 +2165,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2191,10 +2182,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2205,7 +2192,7 @@ static inline void post_schedule(struct rq *rq) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); @@ -2510,8 +2497,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2554,6 +2546,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -2577,36 +2576,40 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; + if (likely(prev->sched_class == class && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev); + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } +again: for_each_class(class) { - p = class->pick_next_task(rq); - if (p) + p = class->pick_next_task(rq, prev); + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ @@ -2700,13 +2703,10 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); - put_prev_task(rq, prev); - next = pick_next_task(rq); + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -2747,7 +2747,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -2757,7 +2757,7 @@ asmlinkage void __sched schedule(void) EXPORT_SYMBOL(schedule); #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void) { /* * If we come here after a random call to set_need_resched(), @@ -2789,7 +2789,7 @@ void __sched schedule_preempt_disabled(void) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void) { /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -2819,7 +2819,7 @@ EXPORT_SYMBOL(preempt_schedule); * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void) { enum ctx_state prev_state; @@ -2852,52 +2852,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* @@ -2908,7 +2862,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -2998,7 +2953,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3076,11 +3031,11 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = TASK_NICE(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + nice = task_nice(current) + increment; + if (nice < MIN_NICE) + nice = MIN_NICE; + if (nice > MAX_NICE) + nice = MAX_NICE; if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3109,18 +3064,6 @@ int task_prio(const struct task_struct *p) } /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. * @@ -3187,11 +3130,11 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3211,9 +3154,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); + set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); + + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3221,8 +3176,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3242,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one. + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ static bool __checkparam_dl(const struct sched_attr *attr) { - return attr && attr->sched_deadline != 0 && - (attr->sched_period == 0 || - (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && - attr->sched_runtime >= (2 << (DL_SCALE - 1)); + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* @@ -3275,6 +3251,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3319,7 +3297,7 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (attr->sched_nice < TASK_NICE(p) && + if (attr->sched_nice < task_nice(p) && !can_nice(p, attr->sched_nice)) return -EPERM; } @@ -3352,7 +3330,7 @@ recheck: * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3389,16 +3367,18 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; if (dl_policy(policy)) goto change; + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } @@ -3452,6 +3432,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3459,16 +3457,18 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -3624,7 +3624,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, * XXX: do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); out: return ret; @@ -3669,6 +3669,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. + * @flags: for future extension. */ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) @@ -3680,8 +3681,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (!uattr || pid < 0 || flags) return -EINVAL; - if (sched_copy_attr(uattr, &attr)) - return -EFAULT; + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; rcu_read_lock(); retval = -ESRCH; @@ -3731,7 +3736,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -3748,11 +3753,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_dl_policy(p)) { - retval = -EINVAL; - goto out_unlock; - } - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* @@ -3813,6 +3815,7 @@ err_size: * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, size, unsigned int, flags) @@ -3845,7 +3848,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else - attr.sched_nice = TASK_NICE(p); + attr.sched_nice = task_nice(p); rcu_read_unlock(); @@ -4483,6 +4486,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4702,8 +4706,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } @@ -4721,6 +4727,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -4761,7 +4783,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -4851,7 +4873,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; @@ -4879,9 +4901,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "name", sd->name, + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[12] is terminator */ + /* &table[13] is terminator */ return table; } @@ -5051,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6025,6 +6049,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) , .last_balance = jiffies, .balance_interval = sd_weight, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, }; SD_INIT_NAME(sd, NUMA); sd->private = &tl->data; @@ -6461,7 +6487,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } @@ -6858,7 +6884,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif @@ -6947,7 +6972,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6965,6 +6991,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); @@ -7018,7 +7051,7 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } @@ -7186,7 +7219,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7613,7 +7646,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7631,7 +7664,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } @@ -7718,8 +7751,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); @@ -7970,8 +8002,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7979,7 +8010,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f90..c143ee380e3 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d4..bd95963dae8 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/slab.h> #include "cpudeadline.h" static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - swap(cp->elements[a], cp->elements[b]); - swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } static void cpudl_heapify(struct cpudl *cp, int idx) @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); - old_idx = cp->cpu_to_idx[cpu]; + old_idx = cp->elements[cpu].idx; if (!is_valid) { /* remove item */ if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; - cp->cpu_to_idx[new_cpu] = old_idx; - cp->cpu_to_idx[cpu] = IDX_INVALID; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; while (old_idx > 0 && dl_time_before( cp->elements[parent(old_idx)].dl, cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size++; cp->elements[cp->size - 1].dl = 0; cp->elements[cp->size - 1].cpu = cpu; - cp->cpu_to_idx[cpu] = cp->size - 1; + cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; - for (i = 0; i < NR_CPUS; i++) - cp->cpu_to_idx[i] = IDX_INVALID; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + cpumask_setall(cp->free_cpus); return 0; @@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp) */ void cpudl_cleanup(struct cpudl *cp) { - /* - * nothing to do for the moment - */ + free_cpumask_var(cp->free_cpus); + kfree(cp->elements); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412..538c9796ad4 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@ #define IDX_INVALID -1 -struct array_item { +struct cpudl_item { u64 dl; int cpu; + int idx; }; struct cpudl { raw_spinlock_t lock; int size; - int cpu_to_idx[NR_CPUS]; - struct array_item elements[NR_CPUS]; cpumask_var_t free_cpus; + struct cpudl_item *elements; }; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d9..8834243abee 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/slab.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d75617349..6b033347fdf 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30..72fdf06ef86 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; @@ -326,50 +332,50 @@ out: * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) + struct rq *rq, int ticks) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; + cputime *= ticks; + scaled *= ticks; + if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_IRQ] += cputime; } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_SOFTIRQ] += cputime; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_guest_time(p, cputime, scaled); } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); } } static void irqtime_account_idle_ticks(int ticks) { - int i; struct rq *rq = this_rq(); - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); + irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} + struct rq *rq, int nr_ticks) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -458,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); + irqtime_account_process_tick(p, user_tick, rq, 1); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6e79b3faa4c..14bc348ba3b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -210,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq) static int push_dl_task(struct rq *rq); +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + #else static inline @@ -232,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -490,9 +513,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq; +again: + rq = task_rq(p); raw_spin_lock(&rq->lock); + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + /* * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something @@ -505,6 +536,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) @@ -586,8 +618,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -870,10 +902,10 @@ static void yield_task_dl(struct rq *rq) * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it - * new scheduling parameters (thanks to dl_new=1). + * new scheduling parameters (thanks to dl_yielded=1). */ if (p->dl.runtime > 0) { - rq->curr->dl.dl_new = 1; + rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } update_curr_dl(rq); @@ -942,6 +974,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +static int pull_dl_task(struct rq *this_rq); + #endif /* CONFIG_SMP */ /* @@ -988,7 +1022,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq) +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -996,9 +1030,29 @@ struct task_struct *pick_next_task_dl(struct rq *rq) dl_rq = &rq->dl; + if (need_pull_dl_task(rq, prev)) { + pull_dl_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a stop task can slip in, in which case we need to + * re-start task selection. + */ + if (rq->stop && rq->stop->on_rq) + return RETRY_TASK; + } + + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); + if (unlikely(!dl_rq->dl_nr_running)) return NULL; + put_prev_task(rq, prev); + dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); @@ -1013,9 +1067,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq) start_hrtick_dl(rq, p); #endif -#ifdef CONFIG_SMP - rq->post_schedule = has_pushable_dl_tasks(rq); -#endif /* CONFIG_SMP */ + set_post_schedule(rq); return p; } @@ -1424,13 +1476,6 @@ skip: return ret; } -static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull other tasks here */ - if (dl_task(prev)) - pull_dl_task(rq); -} - static void post_schedule_dl(struct rq *rq) { push_dl_tasks(rq); @@ -1558,7 +1603,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq || rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ @@ -1623,7 +1668,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .pre_schedule = pre_schedule_dl, .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10..695f9773bb6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif @@ -321,6 +320,7 @@ do { \ P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b4c4f32013..8cbe2d2c16d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; @@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -893,10 +868,26 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? p->numa_group->gid : 0; @@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) p->numa_group->faults[task_faults_idx(nid, 1)]; } +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a @@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p) } /* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + +/* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan * period will be for the next scan window. If local/remote ratio is below @@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1368,10 +1490,14 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; - spin_lock(group_lock); + spin_lock_irq(group_lock); } /* Find the node with the highest number of faults */ @@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { - long diff; + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults_cpu[i] / 2; + p->numa_faults_buffer_cpu[i] = 0; + + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; + faults += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); /* * If the preferred task and group nids are different, * iterate over the nodes again to find the best place. @@ -1432,7 +1572,7 @@ static void task_numa_placement(struct task_struct *p) } } - spin_unlock(group_lock); + spin_unlock_irq(group_lock); } /* Preferred node as the node with the most faults */ @@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + node_set(task_node(current), grp->active_nodes); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1532,11 +1677,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - double_lock(&my_grp->lock, &grp->lock); + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1546,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->nr_tasks++; spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); + spin_unlock_irq(&grp->lock); rcu_assign_pointer(p->numa_group, grp); @@ -1561,34 +1707,38 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; + void *numa_faults = p->numa_faults_memory; + unsigned long flags; int i; - void *numa_faults = p->numa_faults; if (grp) { - spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + spin_lock_irqsave(&grp->lock, flags); + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1603,16 +1753,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1641,7 +1799,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } @@ -2219,13 +2378,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -2323,12 +2489,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -2416,7 +2576,10 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2428,7 +2591,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2578,10 +2747,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -2589,10 +2758,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -2600,10 +2769,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } @@ -2746,17 +2915,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -2778,7 +2966,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -2942,7 +3130,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -3121,7 +3309,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3433,22 +3621,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. */ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -3502,7 +3691,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3517,7 +3706,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } @@ -3558,7 +3747,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -4213,13 +4402,14 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ @@ -4494,26 +4684,124 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; + + if (prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ do { - se = pick_next_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif + + if (!cfs_rq->nr_running) + goto idle; + + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); return p; + +idle: + new_tasks = idle_balance(rq); + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + return NULL; } /* @@ -4751,7 +5039,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -4785,7 +5073,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4816,7 +5104,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -4912,7 +5200,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5775,12 +6063,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { + if (busiest->avg_load > scaled_busy_load_per_task) { pwr_move += busiest->group_power * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ @@ -6359,17 +6645,24 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; + int this_cpu = this_rq->cpu; + + idle_enter_fair(this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + goto out; /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6407,15 +6700,24 @@ void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on @@ -6424,8 +6726,20 @@ void idle_balance(int this_cpu, struct rq *this_rq) this_rq->next_balance = next_balance; } - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; +out: + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + ((this_rq->stop && this_rq->stop->on_rq) || + this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); + this_rq->idle_stamp = 0; + } + + return pulled_task; } /* @@ -6496,6 +6810,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6550,8 +6869,13 @@ static void nohz_balancer_kick(void) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -6605,6 +6929,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. + */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -6867,11 +7197,6 @@ static void run_rebalance_domains(struct softirq_action *h) nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(struct rq *rq) -{ - return !rcu_dereference_sched(rq->sd); -} - /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ @@ -7036,7 +7361,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* @@ -7084,7 +7417,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7110,23 +7445,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. */ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7222,10 +7558,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 00000000000..8f4390a079c --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,265 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/stackprotector.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * return non-zero on failure + */ +static int cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state, ret; + bool broadcast; + + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq and + * set again the polling flag + */ + if (current_clr_polling_and_test()) { + local_irq_enable(); + __current_set_polling(); + return 0; + } + + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ + stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ + rcu_idle_enter(); + + /* + * Check if the cpuidle framework is ready, otherwise fallback + * to the default arch specific idle method + */ + ret = cpuidle_enabled(drv, dev); + + if (!ret) { + /* + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state + */ + next_state = cpuidle_select(drv, dev); + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + } else { + broadcast = !!(drv->states[next_state].flags & + CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast) + /* + * Tell the time framework to switch + * to a broadcast timer because our + * local timer will be shutdown. If a + * local timer is used from another + * cpu as a broadcast timer, this call + * may fail if it is not available + */ + ret = clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + &dev->cpu); + + if (!ret) { + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously + * returned by the governor + * decision. This function will block + * until an interrupt occurs and will + * take care of re-enabling the local + * interrupts + */ + entered_state = cpuidle_enter(drv, dev, + next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, + dev->cpu); + + if (broadcast) + clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the + * outcome + */ + cpuidle_reflect(dev, entered_state); + } + } + } + + /* + * We can't use the cpuidle framework, let's use the default + * idle routine + */ + if (ret) + arch_cpu_idle(); + + __current_set_polling(); + + /* + * It is up to the idle functions to enable back the local + * interrupt + */ + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); + + return 0; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + + arch_cpu_idle_exit(); + } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif + __current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea..879f2b75266 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} - -static void post_schedule_idle(struct rq *rq) -{ - idle_enter_fair(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP - /* Trigger the post schedule to do an idle_enter for CFS */ - rq->post_schedule = 1; -#endif return rq->idle; } @@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, - .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1999021042c..bd2267ad404 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static inline int on_rt_rq(struct sched_rt_entity *rt_se) @@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; @@ -1318,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1340,21 +1352,46 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (need_pull_rt_task(rq, prev)) { + pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl or stop task can slip in, in which case we need + * to re-start task selection. + */ + if (unlikely((rq->stop && rq->stop->on_rq) || + rq->dl.dl_nr_running)) + return RETRY_TASK; + } + + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } @@ -1724,13 +1761,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -1833,7 +1863,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; @@ -2007,7 +2037,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f964add50f3..369b4d663c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) @@ -296,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -441,6 +423,18 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} +#endif + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ @@ -558,11 +552,9 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif + struct sched_avg avg; +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * This is part of a global counter where only the total sum @@ -651,8 +643,6 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif - - struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) @@ -1112,6 +1102,8 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1122,14 +1114,22 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - struct task_struct * (*pick_next_task) (struct rq *rq); + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. + */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1159,6 +1159,11 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1175,16 +1180,14 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(int this_cpu, struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else /* CONFIG_SMP */ +#else -static inline void idle_balance(int cpu, struct rq *rq) -{ -} +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } #endif @@ -1213,16 +1216,6 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; @@ -1392,6 +1385,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) { if (l1 > l2) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8..a476bea17fb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b335..d6ce65dde54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) { - stop->se.exec_start = rq_clock_task(rq); - return stop; - } + if (!stop || !stop->on_rq) + return NULL; - return NULL; + put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void |