diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 414 |
1 files changed, 109 insertions, 305 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393..58453b8272f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; @@ -549,7 +544,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -587,7 +582,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ void sched_avg_update(struct rq *rq) { @@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); + trace_sched_wakeup(p, true); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1379,6 +1411,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP @@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. * @@ -2347,12 +2384,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -2699,8 +2736,35 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + rq_last_tick_reset(rq); } +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +} +#endif + notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { @@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - user_exit(); + prev_state = exception_enter(); + do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void) */ barrier(); } while (need_resched()); + + exception_exit(prev_state); } #endif /* CONFIG_PREEMPT */ @@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p) task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); + print_worker_info(KERN_INFO, p); show_stack(p, NULL); } @@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, @@ -6248,7 +6271,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ @@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr) } #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. + */ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6902,7 +6929,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6928,12 +6955,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -6997,9 +7018,12 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7455,7 +7479,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); |