summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2006-06-27 02:54:51 -0700
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-27 17:32:46 -0700
commitb29739f902ee76a05493fb7d2303490fc75364f4 (patch)
tree1bf48dfb74752a7ef24a2a4a74c45da0aaec754b
parent77ba89c5cf28d5d98a3cae17f67a3e42b102cc25 (diff)
[PATCH] pi-futex: scheduler support for pi
Add framework to boost/unboost the priority of RT tasks. This consists of: - caching the 'normal' priority in ->normal_prio - providing a functions to set/get the priority of the task - make sched_setscheduler() aware of boosting The effective_prio() cleanups also fix a priority-calculation bug pointed out by Andrey Gelman, in set_user_nice(). has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Andrey Gelman <agelman@012.net.il> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/init_task.h2
-rw-r--r--include/linux/sched.h21
-rw-r--r--kernel/sched.c189
3 files changed, 181 insertions, 31 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e127ef7e8da..678c1a90380 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
.lock_depth = -1, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
+ .normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
+ .pi_lock = SPIN_LOCK_UNLOCKED, \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0bc81a151e5..6f167645e7e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -495,8 +495,11 @@ struct signal_struct {
#define MAX_PRIO (MAX_RT_PRIO + 40)
-#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p) rt_prio((p)->prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+ unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
/*
* Some day this will be a full-fledged user tracking system..
@@ -725,7 +728,7 @@ struct task_struct {
#endif
#endif
int load_weight; /* for niceness load balancing purposes */
- int prio, static_prio;
+ int prio, static_prio, normal_prio;
struct list_head run_list;
prio_array_t *array;
@@ -852,6 +855,9 @@ struct task_struct {
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
spinlock_t alloc_lock;
+ /* Protection of the PI data structures: */
+ spinlock_t pi_lock;
+
#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
#endif
extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+ return p->normal_prio;
+}
+#endif
+
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(const task_t *p);
extern int task_nice(const task_t *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 15abf083324..08431f07a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -355,6 +355,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+ __acquires(rq->lock)
+{
+ struct runqueue *rq;
+
+repeat_lock_task:
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (unlikely(rq != task_rq(p))) {
+ spin_unlock(&rq->lock);
+ goto repeat_lock_task;
+ }
+ return rq;
+}
+
+/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
@@ -375,6 +394,12 @@ repeat_lock_task:
return rq;
}
+static inline void __task_rq_unlock(runqueue_t *rq)
+ __releases(rq->lock)
+{
+ spin_unlock(&rq->lock);
+}
+
static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
__releases(rq->lock)
{
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
}
/*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
*
* Both properties are important to certain workloads.
*/
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
{
int bonus, prio;
- if (rt_task(p))
- return p->prio;
-
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
static void set_load_weight(task_t *p)
{
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
#ifdef CONFIG_SMP
if (p == task_rq(p)->migration_thread)
/*
@@ -731,6 +754,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
}
/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+ int prio;
+
+ if (has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
* __activate_task - move a task to the runqueue.
*/
static void __activate_task(task_t *p, runqueue_t *rq)
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
inc_nr_running(p, rq);
}
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child:
+ */
+ p->prio = current->normal_prio;
+
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
#ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
__activate_task(p, rq);
else {
p->prio = current->prio;
+ p->normal_prio = current->normal_prio;
list_add_tail(&p->run_list, &current->run_list);
p->array = current->array;
p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
EXPORT_SYMBOL(sleep_on_timeout);
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+ unsigned long flags;
+ prio_array_t *array;
+ runqueue_t *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+
+ oldprio = p->prio;
+ array = p->array;
+ if (array)
+ dequeue_task(p, array);
+ p->prio = prio;
+
+ if (array) {
+ /*
+ * If changing to an RT priority then queue it
+ * in the active array!
+ */
+ if (rt_task(p))
+ array = rq->active;
+ enqueue_task(p, array);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
+
void set_user_nice(task_t *p, long nice)
{
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
- int old_prio, new_prio, delta;
+ int old_prio, delta;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
* it wont have any effect on scheduling until the task is
* not SCHED_NORMAL/SCHED_BATCH:
*/
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
dec_raw_weighted_load(rq, p);
}
- old_prio = p->prio;
- new_prio = NICE_TO_PRIO(nice);
- delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
- p->prio += delta;
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
if (array) {
enqueue_task(p, array);
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
out_unlock:
task_rq_unlock(rq, &flags);
}
-
EXPORT_SYMBOL(set_user_nice);
/*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
BUG_ON(p->array);
p->policy = policy;
p->rt_priority = prio;
- if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
- p->prio = MAX_RT_PRIO-1 - p->rt_priority;
- } else {
- p->prio = p->static_prio;
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
- }
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ /*
+ * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+ */
+ if (policy == SCHED_BATCH)
+ p->sleep_avg = 0;
set_load_weight(p);
}
@@ -3912,14 +4033,20 @@ recheck:
if (retval)
return retval;
/*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
+ /*
* To be able to change p->policy safely, the apropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
array = p->array;
@@ -3940,7 +4067,9 @@ recheck:
} else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
idle->timestamp = sched_clock();
idle->sleep_avg = 0;
idle->array = NULL;
- idle->prio = MAX_PRIO;
+ idle->prio = idle->normal_prio = MAX_PRIO;
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
if (!rt_task(p))
continue;
- rq = task_rq_lock(p, &flags);
+ spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
array = p->array;
if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
}
read_unlock_irq(&tasklist_lock);
}