summaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c211
1 files changed, 169 insertions, 42 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addc..ad1962dc0aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -600,7 +600,6 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
#endif
- struct lock_class_key rq_lock_key;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -809,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* ratelimit for updating the group shares.
- * default: 0.5ms
+ * default: 0.25ms
*/
-const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
* period over which we measure -rt task cpu usage in us.
@@ -834,7 +833,7 @@ static inline u64 global_rt_period(void)
static inline u64 global_rt_runtime(void)
{
- if (sysctl_sched_rt_period < 0)
+ if (sysctl_sched_rt_runtime < 0)
return RUNTIME_INF;
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -1088,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_DONE;
}
-static void init_hrtick(void)
+static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
@@ -1120,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
#else
static inline void hrtick_clear(struct rq *rq)
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
} else {
if (rq1 < rq2) {
spin_lock(&rq1->lock);
- spin_lock(&rq2->lock);
+ spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
} else {
spin_lock(&rq2->lock);
- spin_lock(&rq1->lock);
+ spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
update_rq_clock(rq1);
@@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
- spin_lock(&this_rq->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
ret = 1;
} else
- spin_lock(&busiest->lock);
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
}
return ret;
}
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -3637,7 +3643,7 @@ redo:
ld_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, CPU_NEWLY_IDLE,
&all_pinned);
- spin_unlock(&busiest->lock);
+ double_unlock_balance(this_rq, busiest);
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
else
schedstat_inc(sd, alb_failed);
}
- spin_unlock(&target_rq->lock);
+ double_unlock_balance(busiest_rq, target_rq);
}
#ifdef CONFIG_NO_HZ
@@ -4173,6 +4179,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
}
/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
+/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -4663,6 +4728,52 @@ int __sched wait_for_completion_killable(struct completion *x)
}
EXPORT_SYMBOL(wait_for_completion_killable);
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Returns: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -5004,19 +5115,21 @@ recheck:
return -EPERM;
}
+ if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (user
- && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
- return -EPERM;
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+ return -EPERM;
#endif
- retval = security_task_setscheduler(p, policy, param);
- if (retval)
- return retval;
+ retval = security_task_setscheduler(p, policy, param);
+ if (retval)
+ return retval;
+ }
+
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -5732,6 +5845,8 @@ static inline void sched_init_granularity(void)
sysctl_sched_latency = limit;
sysctl_sched_wakeup_granularity *= factor;
+
+ sysctl_sched_shares_ratelimit *= factor;
}
#ifdef CONFIG_SMP
@@ -7581,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
- int i, j;
+ int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
- if (doms_new == NULL)
- ndoms_new = 0;
+ n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < ndoms_new; j++) {
+ for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
@@ -7611,7 +7729,6 @@ match1:
if (doms_new == NULL) {
ndoms_cur = 0;
- ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
@@ -7648,8 +7765,13 @@ match2:
int arch_reinit_sched_domains(void)
{
get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
rebuild_sched_domains();
put_online_cpus();
+
return 0;
}
@@ -7671,34 +7793,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
}
#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
}
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+ sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
}
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+ sched_smt_power_savings_show,
sched_smt_power_savings_store);
#endif
@@ -7733,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- partition_sched_domains(0, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
@@ -7998,7 +8120,6 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
- lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
@@ -8455,8 +8576,8 @@ struct task_group *sched_create_group(struct task_group *parent)
WARN_ON(!parent); /* root should already exist */
tg->parent = parent;
- list_add_rcu(&tg->siblings, &parent->children);
INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
return tg;
@@ -8788,6 +8909,9 @@ static int sched_rt_global_constraints(void)
u64 rt_runtime, rt_period;
int ret = 0;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
rt_runtime = tg->rt_bandwidth.rt_runtime;
@@ -8804,6 +8928,9 @@ static int sched_rt_global_constraints(void)
unsigned long flags;
int i;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;