Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    3
-rw-r--r--  kernel/acct.c            |    3
-rw-r--r--  kernel/audit.c           |    2
-rw-r--r--  kernel/auditsc.c         |   10
-rw-r--r--  kernel/cpu.c             |    8
-rw-r--r--  kernel/exit.c            |    9
-rw-r--r--  kernel/fork.c            |   19
-rw-r--r--  kernel/futex.c           | 1067
-rw-r--r--  kernel/futex_compat.c    |   14
-rw-r--r--  kernel/hrtimer.c         |    4
-rw-r--r--  kernel/mutex-debug.c     |    5
-rw-r--r--  kernel/power/Kconfig     |   13
-rw-r--r--  kernel/profile.c         |    2
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/rcutorture.c      |  201
-rw-r--r--  kernel/resource.c        |   38
-rw-r--r--  kernel/rtmutex-debug.c   |  513
-rw-r--r--  kernel/rtmutex-debug.h   |   37
-rw-r--r--  kernel/rtmutex-tester.c  |  440
-rw-r--r--  kernel/rtmutex.c         |  990
-rw-r--r--  kernel/rtmutex.h         |   29
-rw-r--r--  kernel/rtmutex_common.h  |  123
-rw-r--r--  kernel/sched.c           | 1199
-rw-r--r--  kernel/softirq.c         |    4
-rw-r--r--  kernel/softlockup.c      |    4
-rw-r--r--  kernel/sysctl.c          |   27
-rw-r--r--  kernel/timer.c           |    4
-rw-r--r--  kernel/workqueue.c       |    2
28 files changed, 4215 insertions, 569 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 752bd7d383a..82fb182f6f6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -16,6 +16,9 @@ obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 368c4f03fe0..126ca43d5d2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file)
/**
* acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
*/
void acct_init_pacct(struct pacct_struct *pacct)
{
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead)
*
* handles process accounting for an exiting task
*/
-void acct_process()
+void acct_process(void)
{
struct file *file = NULL;
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd..82443fb433e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@ err:
*/
unsigned int audit_serial(void)
{
- static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+ static DEFINE_SPINLOCK(serial_lock);
static unsigned int serial = 0;
unsigned long flags;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda29..dc5e3f01efe 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
return;
error_path:
- if (ctx)
- kfree(ctx);
+ kfree(ctx);
audit_panic("error in audit_log_task_context");
return;
}
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
* @mqdes: MQ descriptor
* @msg_len: Message length
* @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_abs_timeout: Message timeout in absolute time
*
* Returns 0 for success or NULL context or < 0 on error.
*/
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
* __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
* @mqdes: MQ descriptor
* @msg_len: Message length
- * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
*
* Returns 0 for success or NULL context or < 0 on error.
*/
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
* @uid: msgq user id
* @gid: msgq group id
* @mode: msgq mode (permissions)
- * @ipcp: in-kernel IPC permissions
*
* Returns 0 for success or NULL context or < 0 on error.
*/
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 03dcd981846..70fbf2e8376 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
/* This protects CPUs going up and down... */
static DEFINE_MUTEX(cpucontrol);
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
#ifdef CONFIG_HOTPLUG_CPU
static struct task_struct *lock_cpu_hotplug_owner;
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&cpu_chain, nb);
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
-#ifdef CONFIG_HOTPLUG_CPU
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef637be6..ab06b9f88f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code)
tsk->mempolicy = NULL;
#endif
/*
+ * This must happen late, after the PID is not
+ * hashed anymore:
+ */
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+ if (unlikely(current->pi_state_cache))
+ kfree(current->pi_state_cache);
+ /*
* If DEBUG_MUTEXES is on, make sure we are holding no locks:
*/
mutex_debug_check_no_locks_held(tsk);
+ rt_mutex_debug_check_no_locks_held(tsk);
if (tsk->io_context)
exit_io_context();
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b4e54ef022..628198a4f28 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
void free_task(struct task_struct *tsk)
{
free_thread_info(tsk->thread_info);
+ rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
return current->pid;
}
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+ spin_lock_init(&p->pi_lock);
+ plist_head_init(&p->pi_waiters, &p->pi_lock);
+ p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+ spin_lock_init(&p->held_list_lock);
+ INIT_LIST_HEAD(&p->held_list_head);
+# endif
+#endif
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
mpol_fix_fork_child_flag(p);
#endif
+ rt_mutex_init_task(p);
+
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
@@ -1076,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
+ INIT_LIST_HEAD(&p->pi_state_list);
+ p->pi_state_cache = NULL;
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5..6c91f938005 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
* (C) Copyright 2006 Red Hat Inc, All Rights Reserved
* Thanks to Thomas Gleixner for suggestions, analysis and fixes.
*
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
#include <linux/signal.h>
#include <asm/futex.h>
+#include "rtmutex_common.h"
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -63,7 +69,7 @@ union futex_key {
int offset;
} shared;
struct {
- unsigned long uaddr;
+ unsigned long address;
struct mm_struct *mm;
int offset;
} private;
@@ -75,6 +81,27 @@ union futex_key {
};
/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex pi_mutex;
+
+ struct task_struct *owner;
+ atomic_t refcount;
+
+ union futex_key key;
+};
+
+/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
@@ -87,15 +114,19 @@ struct futex_q {
struct list_head list;
wait_queue_head_t waiters;
- /* Which hash list lock to use. */
+ /* Which hash list lock to use: */
spinlock_t *lock_ptr;
- /* Key which the futex is hashed on. */
+ /* Key which the futex is hashed on: */
union futex_key key;
- /* For fd, sigio sent using these. */
+ /* For fd, sigio sent using these: */
int fd;
struct file *filp;
+
+ /* Optional priority inheritance state: */
+ struct futex_pi_state *pi_state;
+ struct task_struct *task;
};
/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*
* Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
{
+ unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
/*
* The futex address must be "naturally" aligned.
*/
- key->both.offset = uaddr % PAGE_SIZE;
+ key->both.offset = address % PAGE_SIZE;
if (unlikely((key->both.offset % sizeof(u32)) != 0))
return -EINVAL;
- uaddr -= key->both.offset;
+ address -= key->both.offset;
/*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
- vma = find_extend_vma(mm, uaddr);
+ vma = find_extend_vma(mm, address);
if (unlikely(!vma))
return -EFAULT;
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
key->private.mm = mm;
- key->private.uaddr = uaddr;
+ key->private.address = address;
return 0;
}
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
key->shared.inode = vma->vm_file->f_dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+ key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
return 0;
}
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
* from swap. But that's a lot of code to duplicate here
* for a rare case, so we simply fetch the page.
*/
- err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+ err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
if (err >= 0) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key)
}
}
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
int ret;
inc_preempt_count();
- ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+ ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
dec_preempt_count();
return ret ? -EFAULT : 0;
}
/*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+
+ if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+ vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+ return -EFAULT;
+
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ default:
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ memset(pi_state, 0, sizeof(*pi_state));
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!atomic_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ }
+
+ if (current->pi_state_cache)
+ kfree(pi_state);
+ else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We don't trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+ if ((current->euid != p->euid) && (current->euid != p->uid)) {
+ p = NULL;
+ goto out_unlock;
+ }
+ if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+ p = NULL;
+ goto out_unlock;
+ }
+ get_task_struct(p);
+out_unlock:
+ read_unlock(&tasklist_lock);
+
+ return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+ struct futex_hash_bucket *hb;
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ union futex_key key;
+
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselves
+ */
+ spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ spin_unlock_irq(&curr->pi_lock);
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ if (head->next != next) {
+ spin_unlock(&hb->lock);
+ continue;
+ }
+
+ list_del_init(&pi_state->list);
+
+ WARN_ON(pi_state->owner != curr);
+
+ pi_state->owner = NULL;
+ spin_unlock_irq(&curr->pi_lock);
+
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+ spin_lock_irq(&curr->pi_lock);
+ }
+ spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_q *this, *next;
+ struct list_head *head;
+ struct task_struct *p;
+ pid_t pid;
+
+ head = &hb->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &me->key)) {
+ /*
+ * Another waiter already exists - bump up
+ * the refcount and return its pi_state:
+ */
+ pi_state = this->pi_state;
+ atomic_inc(&pi_state->refcount);
+ me->pi_state = pi_state;
+
+ return 0;
+ }
+ }
+
+ /*
+ * We are the first waiter - try to look up the real owner and
+ * attach the new pi_state to it:
+ */
+ pid = uval & FUTEX_TID_MASK;
+ p = futex_find_get_task(pid);
+ if (!p)
+ return -ESRCH;
+
+ pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make 'p'
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = me->key;
+
+ spin_lock_irq(&p->pi_lock);
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+ spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ me->pi_state = pi_state;
+
+ return 0;
+}
+
+/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
*/
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
q->lock_ptr = NULL;
}
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+ struct task_struct *new_owner;
+ struct futex_pi_state *pi_state = this->pi_state;
+ u32 curval, newval;
+
+ if (!pi_state)
+ return -EINVAL;
+
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+ * This happens when we have stolen the lock and the original
+ * pending owner did not enqueue itself back on the rt_mutex.
+	 * That's not a tragedy: we know that a lock waiter is in
+	 * flight. We make the futex_q waiter the pending owner.
+ */
+ if (!new_owner)
+ new_owner = this->task;
+
+ /*
+ * We pass it to the next owner. (The WAITERS bit is always
+ * kept enabled while there is PI state around. We must also
+ * preserve the owner died bit.)
+ */
+ newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+
+ if (curval == -EFAULT)
+ return -EFAULT;
+ if (curval != uval)
+ return -EINVAL;
+
+ list_del_init(&pi_state->owner->pi_state_list);
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+ u32 oldval;
+
+ /*
+	 * There is no waiter, so we unlock the futex. The owner-died
+	 * bit need not be preserved here. We are the owner:
+ */
+ inc_preempt_count();
+ oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+ dec_preempt_count();
+
+ if (oldval == -EFAULT)
+ return oldval;
+ if (oldval != uval)
+ return -EAGAIN;
+
+ return 0;
+}
+
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
{
- union futex_key key;
- struct futex_hash_bucket *bh;
- struct list_head *head;
+ struct futex_hash_bucket *hb;
struct futex_q *this, *next;
+ struct list_head *head;
+ union futex_key key;
int ret;
down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
if (unlikely(ret != 0))
goto out;
- bh = hash_futex(&key);
- spin_lock(&bh->lock);
- head = &bh->chain;
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ head = &hb->chain;
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
+ if (this->pi_state)
+ return -EINVAL;
wake_futex(this);
if (++ret >= nr_wake)
break;
}
}
- spin_unlock(&bh->lock);
+ spin_unlock(&hb->lock);
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -324,10 +648,12 @@ out:
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
- struct futex_hash_bucket *bh1, *bh2;
+ struct futex_hash_bucket *hb1, *hb2;
struct list_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@ retryfull:
if (unlikely(ret != 0))
goto out;
- bh1 = hash_futex(&key1);
- bh2 = hash_futex(&key2);
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
retry:
- if (bh1 < bh2)
- spin_lock(&bh1->lock);
- spin_lock(&bh2->lock);
- if (bh1 > bh2)
- spin_lock(&bh1->lock);
+ if (hb1 < hb2)
+ spin_lock(&hb1->lock);
+ spin_lock(&hb2->lock);
+ if (hb1 > hb2)
+ spin_lock(&hb1->lock);
- op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- int dummy;
+ u32 dummy;
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
#ifndef CONFIG_MMU
- /* we don't get EFAULT from MMU faults if we don't have an MMU,
- * but we might get them from range checking */
+ /*
+ * we don't get EFAULT from MMU faults if we don't have an MMU,
+ * but we might get them from range checking
+ */
ret = op_ret;
goto out;
#endif
@@ -372,47 +700,34 @@ retry:
goto out;
}
- /* futex_atomic_op_inuser needs to both read and write
+ /*
+ * futex_atomic_op_inuser needs to both read and write
* *(int __user *)uaddr2, but we can't modify it
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
- * still holding the mmap_sem. */
+ * still holding the mmap_sem.
+ */
if (attempt++) {
- struct vm_area_struct * vma;
- struct mm_struct *mm = current->mm;
-
- ret = -EFAULT;
- if (attempt >= 2 ||
- !(vma = find_vma(mm, uaddr2)) ||
- vma->vm_start > uaddr2 ||
- !(vma->vm_flags & VM_WRITE))
- goto out;
-
- switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
+ if (futex_handle_fault((unsigned long)uaddr2,
+ attempt))
goto out;
- }
goto retry;
}
- /* If we would have faulted, release mmap_sem,
- * fault it in and start all over again. */
+ /*
+ * If we would have faulted, release mmap_sem,
+ * fault it in and start all over again.
+ */
up_read(&current->mm->mmap_sem);
- ret = get_user(dummy, (int __user *)uaddr2);
+ ret = get_user(dummy, uaddr2);
if (ret)
return ret;
goto retryfull;
}
- head = &bh1->chain;
+ head = &hb1->chain;
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@ retry:
}
if (op_ret > 0) {
- head = &bh2->chain;
+ head = &hb2->chain;
op_ret = 0;
list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@ retry:
ret += op_ret;
}
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
out:
up_read(&current->mm->mmap_sem);
return ret;
@@ -448,11 +763,11 @@ out:
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
- int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
- struct futex_hash_bucket *bh1, *bh2;
+ struct futex_hash_bucket *hb1, *hb2;
struct list_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
if (unlikely(ret != 0))
goto out;
- bh1 = hash_futex(&key1);
- bh2 = hash_futex(&key2);
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
- if (bh1 < bh2)
- spin_lock(&bh1->lock);
- spin_lock(&bh2->lock);
- if (bh1 > bh2)
- spin_lock(&bh1->lock);
+ if (hb1 < hb2)
+ spin_lock(&hb1->lock);
+ spin_lock(&hb2->lock);
+ if (hb1 > hb2)
+ spin_lock(&hb1->lock);
- if (likely(valp != NULL)) {
- int curval;
+ if (likely(cmpval != NULL)) {
+ u32 curval;
- ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+ ret = get_futex_value_locked(&curval, uaddr1);
if (unlikely(ret)) {
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
- /* If we would have faulted, release mmap_sem, fault
+ /*
+ * If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
up_read(&current->mm->mmap_sem);
- ret = get_user(curval, (int __user *)uaddr1);
+ ret = get_user(curval, uaddr1);
if (!ret)
goto retry;
return ret;
}
- if (curval != *valp) {
+ if (curval != *cmpval) {
ret = -EAGAIN;
goto out_unlock;
}
}
- head1 = &bh1->chain;
+ head1 = &hb1->chain;
list_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
wake_futex(this);
} else {
- list_move_tail(&this->list, &bh2->chain);
- this->lock_ptr = &bh2->lock;
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ list_move_tail(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+ }
this->key = key2;
get_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
break;
- /* Make sure to stop if key1 == key2 */
- if (head1 == &bh2->chain && head1 != &next->list)
- head1 = &this->list;
}
}
out_unlock:
- spin_unlock(&bh1->lock);
- if (bh1 != bh2)
- spin_unlock(&bh2->lock);
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
/* drop_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
@@ -543,7 +862,7 @@ out:
static inline struct futex_hash_bucket *
queue_lock(struct futex_q *q, int fd, struct file *filp)
{
- struct futex_hash_bucket *bh;
+ struct futex_hash_bucket *hb;
q->fd = fd;
q->filp = filp;
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
init_waitqueue_head(&q->waiters);
get_key_refs(&q->key);
- bh = hash_futex(&q->key);
- q->lock_ptr = &bh->lock;
+ hb = hash_futex(&q->key);
+ q->lock_ptr = &hb->lock;
- spin_lock(&bh->lock);
- return bh;
+ spin_lock(&hb->lock);
+ return hb;
}
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &bh->chain);
- spin_unlock(&bh->lock);
+ list_add_tail(&q->list, &hb->chain);
+ q->task = current;
+ spin_unlock(&hb->lock);
}
static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
- spin_unlock(&bh->lock);
+ spin_unlock(&hb->lock);
drop_key_refs(&q->key);
}
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
/* The key must be already stored in q->key. */
static void queue_me(struct futex_q *q, int fd, struct file *filp)
{
- struct futex_hash_bucket *bh;
- bh = queue_lock(q, fd, filp);
- __queue_me(q, bh);
+ struct futex_hash_bucket *hb;
+
+ hb = queue_lock(q, fd, filp);
+ __queue_me(q, hb);
}
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
- int ret = 0;
spinlock_t *lock_ptr;
+ int ret = 0;
/* In the common case we don't take the spinlock, which is nice. */
retry:
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q)
}
WARN_ON(list_empty(&q->list));
list_del(&q->list);
+
+ BUG_ON(q->pi_state);
+
spin_unlock(lock_ptr);
ret = 1;
}
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q)
return ret;
}
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+/*
+ * PI futexes cannot be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
{
- DECLARE_WAITQUEUE(wait, current);
- int ret, curval;
+ WARN_ON(list_empty(&q->list));
+ list_del(&q->list);
+
+ BUG_ON(!q->pi_state);
+ free_pi_state(q->pi_state);
+ q->pi_state = NULL;
+
+ spin_unlock(&hb->lock);
+
+ drop_key_refs(&q->key);
+}
+
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+{
+ struct task_struct *curr = current;
+ DECLARE_WAITQUEUE(wait, curr);
+ struct futex_hash_bucket *hb;
struct futex_q q;
- struct futex_hash_bucket *bh;
+ u32 uval;
+ int ret;
+ q.pi_state = NULL;
retry:
- down_read(&current->mm->mmap_sem);
+ down_read(&curr->mm->mmap_sem);
ret = get_futex_key(uaddr, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
- bh = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q, -1, NULL);
/*
* Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
* We hold the mmap semaphore, so the mapping cannot have changed
* since we looked it up in get_futex_key.
*/
-
- ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+ ret = get_futex_value_locked(&uval, uaddr);
if (unlikely(ret)) {
- queue_unlock(&q, bh);
+ queue_unlock(&q, hb);
- /* If we would have faulted, release mmap_sem, fault it in and
+ /*
+ * If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&current->mm->mmap_sem);
+ up_read(&curr->mm->mmap_sem);
- ret = get_user(curval, (int __user *)uaddr);
+ ret = get_user(uval, uaddr);
if (!ret)
goto retry;
return ret;
}
- if (curval != val) {
- ret = -EWOULDBLOCK;
- queue_unlock(&q, bh);
- goto out_release_sem;
- }
+ ret = -EWOULDBLOCK;
+ if (uval != val)
+ goto out_unlock_release_sem;
/* Only actually queue if *uaddr contained val. */
- __queue_me(&q, bh);
+ __queue_me(&q, hb);
/*
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
- */
- up_read(&current->mm->mmap_sem);
+ */
+ up_read(&curr->mm->mmap_sem);
/*
* There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
return 0;
if (time == 0)
return -ETIMEDOUT;
- /* We expect signal_pending(current), but another thread may
- * have handled it for us already. */
+ /*
+ * We expect signal_pending(current), but another thread may
+ * have handled it for us already.
+ */
return -EINTR;
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
out_release_sem:
+ up_read(&curr->mm->mmap_sem);
+ return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+ struct hrtimer_sleeper *to)
+{
+ struct task_struct *curr = current;
+ struct futex_hash_bucket *hb;
+ u32 uval, newval, curval;
+ struct futex_q q;
+ int ret, attempt = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ q.pi_state = NULL;
+ retry:
+ down_read(&curr->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &q.key);
+ if (unlikely(ret != 0))
+ goto out_release_sem;
+
+ hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+ /*
+ * To avoid races, we attempt to take the lock here again
+ * (by doing a 0 -> TID atomic cmpxchg), while holding all
+ * the locks. It will most likely not succeed.
+ */
+ newval = current->pid;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+
+ /* We own the lock already */
+ if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+ if (!detect && 0)
+ force_sig(SIGKILL, current);
+ ret = -EDEADLK;
+ goto out_unlock_release_sem;
+ }
+
+ /*
+ * Surprise - we got the lock. Just return
+ * to userspace:
+ */
+ if (unlikely(!curval))
+ goto out_unlock_release_sem;
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+
+ /*
+	 * We don't have the lock. Look up the PI state (or create it if
+ * we are the first waiter):
+ */
+ ret = lookup_pi_state(uval, hb, &q);
+
+ if (unlikely(ret)) {
+ /*
+ * There were no waiters and the owner task lookup
+ * failed. When the OWNER_DIED bit is set, then we
+ * know that this is a robust futex and we actually
+ * take the lock. This is safe as we are protected by
+ * the hash bucket lock. We also set the waiters bit
+ * unconditionally here, to simplify glibc handling of
+ * multiple tasks racing to acquire the lock and
+ * cleanup the problems which were left by the dead
+ * owner.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ uval = newval;
+ newval = current->pid |
+ FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ dec_preempt_count();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+ ret = 0;
+ }
+ goto out_unlock_release_sem;
+ }
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __queue_me(&q, hb);
+
+ /*
+ * Now the futex is queued and we have checked the data, we
+ * don't want to hold mmap_sem while we sleep.
+ */
+ up_read(&curr->mm->mmap_sem);
+
+ WARN_ON(!q.pi_state);
+ /*
+ * Block on the PI mutex:
+ */
+ if (!trylock)
+ ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+ else {
+ ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+
+ down_read(&curr->mm->mmap_sem);
+ hb = queue_lock(&q, -1, NULL);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ u32 newtid = current->pid | FUTEX_WAITERS;
+
+ /* Owner died? */
+ if (q.pi_state->owner != NULL) {
+ spin_lock_irq(&q.pi_state->owner->pi_lock);
+ list_del_init(&q.pi_state->list);
+ spin_unlock_irq(&q.pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ q.pi_state->owner = current;
+
+ spin_lock_irq(&current->pi_lock);
+ list_add(&q.pi_state->list, &current->pi_state_list);
+ spin_unlock_irq(&current->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ /*
+ * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic as we have to preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ }
+
+ if (!detect && ret == -EDEADLK && 0)
+ force_sig(SIGKILL, current);
+
+ return ret;
+
+ out_unlock_release_sem:
+ queue_unlock(&q, hb);
+
+ out_release_sem:
+ up_read(&curr->mm->mmap_sem);
+ return ret;
+
+ uaddr_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ if (futex_handle_fault((unsigned long)uaddr, attempt))
+ goto out_unlock_release_sem;
+
+ goto retry_locked;
+ }
+
+ queue_unlock(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
+ return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ int ret;
+
+ restart->fn = do_no_restart_syscall;
+
+ if (restart->arg2 || restart->arg3) {
+ to = &timeout;
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+ hrtimer_init_sleeper(to, current);
+ to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+ (u64) restart->arg0;
+ }
+
+ pr_debug("lock_pi restart: %p, %d (%d)\n",
+ (u32 __user *)restart->arg0, current->pid);
+
+ ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+ 0, to);
+
+ if (ret != -EINTR)
+ return ret;
+
+ restart->fn = futex_lock_pi_restart;
+
+ /* The other values are filled in */
+ return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+ long nsec, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct restart_block *restart;
+ int ret;
+
+ if (sec != MAX_SCHEDULE_TIMEOUT) {
+ to = &timeout;
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+ hrtimer_init_sleeper(to, current);
+ to->timer.expires = ktime_set(sec, nsec);
+ }
+
+ ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+ if (ret != -EINTR)
+ return ret;
+
+ pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_lock_pi_restart;
+ restart->arg0 = (unsigned long) uaddr;
+ restart->arg1 = detect;
+ if (to) {
+ restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+ restart->arg3 = to->timer.expires.tv64 >> 32;
+ } else
+ restart->arg2 = restart->arg3 = 0;
+
+ return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ u32 uval;
+ struct list_head *head;
+ union futex_key key;
+ int ret, attempt = 0;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != current->pid)
+ return -EPERM;
+ /*
+ * First take all the futex related locks:
+ */
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr, &key);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+
+retry_locked:
+ /*
+ * To avoid races, try to do the TID -> 0 atomic transition
+ * again. If it succeeds then we can return without waking
+ * anyone else up:
+ */
+ inc_preempt_count();
+ uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+ dec_preempt_count();
+
+ if (unlikely(uval == -EFAULT))
+ goto pi_faulted;
+ /*
+ * Rare case: we managed to release the lock atomically,
+ * no need to wake anyone else up:
+ */
+ if (unlikely(uval == current->pid))
+ goto out_unlock;
+
+ /*
+ * Ok, other tasks may need to be woken up - check waiters
+ * and do the wakeup if necessary:
+ */
+ head = &hb->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (!match_futex (&this->key, &key))
+ continue;
+ ret = wake_futex_pi(uaddr, uval, this);
+ /*
+ * The atomic access to the futex value
+ * generated a pagefault, so retry the
+ * user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ goto out_unlock;
+ }
+ /*
+ * No waiters - kernel unlocks the futex:
+ */
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+out:
up_read(&current->mm->mmap_sem);
+
+ return ret;
+
+pi_faulted:
+ /*
+ * We have to r/w *(int __user *)uaddr, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem.
+ */
+ if (attempt++) {
+ if (futex_handle_fault((unsigned long)uaddr, attempt))
+ goto out_unlock;
+
+ goto retry_locked;
+ }
+
+ spin_unlock(&hb->lock);
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(uval, uaddr);
+ if (!ret && (uval != -EFAULT))
+ goto retry;
+
return ret;
}
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp)
unqueue_me(q);
kfree(q);
+
return 0;
}
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = {
* Signal allows caller to avoid the race which would occur if they
* set the sigio stuff up afterwards.
*/
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
{
struct futex_q *q;
struct file *filp;
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal)
err = -ENOMEM;
goto error;
}
+ q->pi_state = NULL;
down_read(&current->mm->mmap_sem);
err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@ error:
* Implementation: user-space maintains a per-thread list of locks it
* is holding. Upon do_exit(), the kernel carefully walks this list,
* and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
* always manipulated with the lock held, so the list is private and
* per-thread. Userspace also maintains a per-thread 'list_op_pending'
* field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1669,7 @@ err_unlock:
*/
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
{
- u32 uval;
+ u32 uval, nval;
retry:
if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@ retry:
* thread-death.) The rest of the cleanup is done in
* userspace.
*/
- if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
- uval | FUTEX_OWNER_DIED) != uval)
+ nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+ uval | FUTEX_OWNER_DIED);
+ if (nval == -EFAULT)
+ return -1;
+
+ if (nval != uval)
goto retry;
if (uval & FUTEX_WAITERS)
- futex_wake((unsigned long)uaddr, 1);
+ futex_wake(uaddr, 1);
}
return 0;
}
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr)
while (entry != &head->list) {
/*
* A pending lock might already be on the list, so
- * dont process it twice:
+ * don't process it twice:
*/
if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr)
}
}
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
- unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
break;
+ case FUTEX_LOCK_PI:
+ ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ break;
+ case FUTEX_UNLOCK_PI:
+ ret = futex_unlock_pi(uaddr);
+ break;
+ case FUTEX_TRYLOCK_PI:
+ ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ break;
default:
ret = -ENOSYS;
}
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
}
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
struct timespec __user *utime, u32 __user *uaddr2,
- int val3)
+ u32 val3)
{
struct timespec t;
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
- int val2 = 0;
+ u32 val2 = 0;
- if (utime && (op == FUTEX_WAIT)) {
+ if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (copy_from_user(&t, utime, sizeof(t)) != 0)
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- timeout = timespec_to_jiffies(&t) + 1;
+ if (op == FUTEX_WAIT)
+ timeout = timespec_to_jiffies(&t) + 1;
+ else {
+ timeout = t.tv_sec;
+ val2 = t.tv_nsec;
+ }
}
/*
* requeue parameter in 'utime' if op == FUTEX_REQUEUE.
*/
- if (op >= FUTEX_REQUEUE)
- val2 = (int) (unsigned long) utime;
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ val2 = (u32) (unsigned long) utime;
- return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
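
The comments in do_futex_lock_pi() and futex_unlock_pi() above describe the protocol this code expects from userspace: lock by a 0 -> TID cmpxchg, unlock by a TID -> 0 cmpxchg, and enter the kernel only when those fail. A minimal userspace-side sketch of that fast-path / slow-path split, assuming the FUTEX_LOCK_PI / FUTEX_UNLOCK_PI opcodes from <linux/futex.h>; the sys_futex(), pi_lock() and pi_unlock() helpers below are made up for illustration, and the production implementation is glibc's PI mutex code, not this sketch:

/*
 * Illustrative userspace counterpart to the PI-futex kernel code above.
 */
#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val, void *timeout)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

static void pi_lock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t expected = 0;

	/* Fast path: 0 -> TID means the futex was free and is now ours. */
	if (__atomic_compare_exchange_n(futex, &expected, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;

	/*
	 * Contended: the kernel queues us on the hash bucket, boosts the
	 * owner via the rt_mutex, and eventually stores our TID (with
	 * FUTEX_WAITERS set) into *futex.
	 */
	while (sys_futex(futex, FUTEX_LOCK_PI, 0, NULL) != 0) {
		if (errno != EINTR)
			break;		/* real code must handle EDEADLK etc. */
	}
}

static void pi_unlock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t expected = tid;

	/* Fast path: TID -> 0 succeeds only if nobody is queued. */
	if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;

	/*
	 * FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the kernel hand
	 * the lock and the rt_mutex to the next waiter.
	 */
	sys_futex(futex, FUTEX_UNLOCK_PI, 0, NULL);
}
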
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d1..d1d92b441fb 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
- if (utime && (op == FUTEX_WAIT)) {
+ if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
if (get_compat_timespec(&t, utime))
return -EFAULT;
if (!timespec_valid(&t))
return -EINVAL;
- timeout = timespec_to_jiffies(&t) + 1;
+ if (op == FUTEX_WAIT)
+ timeout = timespec_to_jiffies(&t) + 1;
+ else {
+ timeout = t.tv_sec;
+ val2 = t.tv_nsec;
+ }
}
- if (op >= FUTEX_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
val2 = (int) (unsigned long) utime;
- return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 55601b3ce60..8d3dc29ef41 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int hrtimer_cpu_notify(struct notifier_block *self,
+static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block hrtimers_nb = {
+static struct notifier_block __devinitdata hrtimers_nb = {
.notifier_call = hrtimer_cpu_notify,
};
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 036b6285b15..e38e4bac97c 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/poison.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
void debug_mutex_init_waiter(struct mutex_waiter *waiter)
{
- memset(waiter, 0x11, sizeof(*waiter));
+ memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
}
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
{
DEBUG_WARN_ON(!list_empty(&waiter->list));
- memset(waiter, 0x22, sizeof(*waiter));
+ memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index fc311a4673a..857b4fa0912 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -38,13 +38,22 @@ config PM_DEBUG
config PM_TRACE
bool "Suspend/resume event tracing"
- depends on PM && PM_DEBUG && X86_32
- default y
+ depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
+ default n
---help---
This enables some cheesy code to save the last PM event point in the
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
+ To use this debugging feature you should attempt to suspend the machine,
+ then reboot it, then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
+
config SOFTWARE_SUSPEND
bool "Software Suspend"
depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e50..5a730fdb1a2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@ out:
}
#ifdef CONFIG_HOTPLUG_CPU
-static int profile_cpu_callback(struct notifier_block *info,
+static int __devinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21..f464f5ae3f1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
return rcu_ctrlblk.completed;
}
+/*
+ * Return the number of RCU batches processed thus far. Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+ return rcu_bh_ctrlblk.completed;
+}
+
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
-static int rcu_cpu_notify(struct notifier_block *self,
+static int __devinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block rcu_nb = {
+static struct notifier_block __devinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0);
module_param(rsinterval, int, 0);
#endif
EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d1..4d1c3d24712 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
/*
- * Read-Copy Update /proc-based torture test facility
+ * Read-Copy Update module-based torture test facility
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
static int verbose; /* Print more debug info. */
static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static char *torture_type = "rcu"; /* What to torture. */
module_param(nreaders, int, 0);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
module_param(shuffle_interval, int, 0);
MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-#define TORTURE_FLAG "rcutorture: "
+module_param(torture_type, charp, 0);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+
+#define TORTURE_FLAG "-torture:"
#define PRINTK_STRING(s) \
- do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+ do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+ do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
static char printk_buf[4096];
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
spin_unlock_bh(&rcu_torture_lock);
}
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
- int i;
- struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
- if (fullstop) {
- /* Test is ending, just drop callbacks on the floor. */
- /* The next initialization will pick up the pieces. */
- return;
- }
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- rcu_torture_free(rp);
- } else
- call_rcu(p, rcu_torture_cb);
-}
-
struct rcu_random_state {
unsigned long rrs_state;
unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
}
/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ void (*init)(void);
+ void (*cleanup)(void);
+ int (*readlock)(void);
+ void (*readunlock)(int idx);
+ int (*completed)(void);
+ void (*deferredfree)(struct rcu_torture *p);
+ int (*stats)(char *page);
+ char *name;
+};
+static struct rcu_torture_ops *cur_ops = NULL;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_torture_read_unlock(int idx)
+{
+ rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+ return rcu_batches_completed();
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ int i;
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (fullstop) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ rcu_torture_free(rp);
+ } else
+ cur_ops->deferredfree(rp);
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_torture_read_lock,
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_torture_completed,
+ .deferredfree = rcu_torture_deferred_free,
+ .stats = NULL,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx)
+{
+ rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+ return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .init = NULL,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferredfree = rcu_bh_torture_deferred_free,
+ .stats = NULL,
+ .name = "rcu_bh"
+};
+
+static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_bh_ops, NULL };
+
+/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
* after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
do {
schedule_timeout_uninterruptible(1);
- if (rcu_batches_completed() == oldbatch)
- continue;
if ((rp = rcu_torture_alloc()) == NULL)
continue;
rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+ cur_ops->deferredfree(old_rp);
}
rcu_torture_current_version++;
- oldbatch = rcu_batches_completed();
+ oldbatch = cur_ops->completed();
} while (!kthread_should_stop() && !fullstop);
VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
rcu_torture_reader(void *arg)
{
int completed;
+ int idx;
DEFINE_RCU_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
set_user_nice(current, 19);
do {
- rcu_read_lock();
- completed = rcu_batches_completed();
+ idx = cur_ops->readlock();
+ completed = cur_ops->completed();
p = rcu_dereference(rcu_torture_current);
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcu_read_unlock();
+ cur_ops->readunlock(idx);
schedule_timeout_interruptible(HZ);
continue;
}
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
pipe_count = RCU_TORTURE_PIPE_LEN;
}
++__get_cpu_var(rcu_torture_count)[pipe_count];
- completed = rcu_batches_completed() - completed;
+ completed = cur_ops->completed() - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
++__get_cpu_var(rcu_torture_batch)[completed];
preempt_enable();
- rcu_read_unlock();
+ cur_ops->readunlock(idx);
schedule();
} while (!kthread_should_stop() && !fullstop);
VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
if (pipesummary[i] != 0)
break;
}
- cnt += sprintf(&page[cnt], "rcutorture: ");
+ cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
"rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_mberror));
if (atomic_read(&n_rcu_torture_mberror) != 0)
cnt += sprintf(&page[cnt], " !!!");
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
cnt += sprintf(&page[cnt], "!!! ");
atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt], "Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt], "Reader Batch: ");
- for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
- cnt += sprintf(&page[cnt], "\nrcutorture: ");
+ cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
cnt += sprintf(&page[cnt], " %d",
atomic_read(&rcu_torture_wcount[i]));
}
cnt += sprintf(&page[cnt], "\n");
+ if (cur_ops->stats != NULL)
+ cnt += cur_ops->stats(&page[cnt]);
return cnt;
}
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
static inline void
rcu_torture_print_module_parms(char *tag)
{
- printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+ printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval = %d\n",
- tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
- shuffle_interval);
+ torture_type, tag, nrealreaders, stat_interval, verbose,
+ test_no_idle_hz, shuffle_interval);
}
static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
rcu_barrier();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
rcu_torture_print_module_parms("End of test: FAILURE");
else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
/* Process args and tell the world that the torturer is on the job. */
+	for (i = 0; (cur_ops = torture_ops[i]) != NULL; i++) {
+		if (strcmp(torture_type, cur_ops->name) == 0)
+			break;
+	}
+ if (cur_ops == NULL) {
+ printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+ torture_type);
+ return (-EINVAL);
+ }
+ if (cur_ops->init != NULL)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
if (nreaders >= 0)
nrealreaders = nreaders;
else
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a..2404f9b0bc4 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory resource that exists within [res->start, res->end).
+ * The caller must specify res->start, res->end and res->flags.
+ * If found, returns 0 and res is overwritten; if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+ resource_size_t start, end;
+ struct resource *p;
+
+ BUG_ON(!res);
+
+ start = res->start;
+ end = res->end;
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ /* system ram is just marked as IORESOURCE_MEM */
+ if (p->flags != res->flags)
+ continue;
+ if (p->start > end) {
+ p = NULL;
+ break;
+ }
+ if (p->start >= start)
+ break;
+ }
+ read_unlock(&resource_lock);
+ if (!p)
+ return -1;
+ /* copy data */
+ res->start = p->start;
+ res->end = p->end;
+ return 0;
+}
+#endif
+
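A minimal sketch of how a memory-hotplug style caller might walk every System RAM range with the helper above; walk_system_ram(), its bounds and the per-range processing are illustrative placeholders, not part of this patch.

/* Illustration only: visit each System RAM resource within [start, limit]. */
static void walk_system_ram(resource_size_t start, resource_size_t limit)
{
	struct resource res;

	res.start = start;
	res.end = limit;
	res.flags = IORESOURCE_MEM;		/* System RAM is marked with just this flag */

	while (find_next_system_ram(&res) == 0) {
		/* [res.start, res.end] is one System RAM resource here */
		if (res.end >= limit)
			break;			/* reached the search bound */
		res.start = res.end + 1;	/* continue after this range */
		res.end = limit;		/* restore the upper bound */
	}
}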
/*
* Find empty slot in the resource tree given range and alignment.
*/
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 00000000000..4aa8a2c9f45
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x) WARN_ON(x)
+# define TRACE_BUG_ON(x) BUG_ON(x)
+
+# define TRACE_OFF() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ if (spin_is_locked(&current->pi_lock)) \
+ spin_unlock(&current->pi_lock); \
+ if (spin_is_locked(&current->held_list_lock)) \
+ spin_unlock(&current->held_list_lock); \
+ } \
+} while (0)
+
+# define TRACE_OFF_NOLOCK() \
+do { \
+ if (rt_trace_on) { \
+ rt_trace_on = 0; \
+ console_verbose(); \
+ } \
+} while (0)
+
+# define TRACE_BUG_LOCKED() \
+do { \
+ TRACE_OFF(); \
+ BUG(); \
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) { \
+ TRACE_OFF(); \
+ WARN_ON(1); \
+ } \
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c) \
+do { \
+ if (unlikely(c)) \
+ TRACE_BUG_LOCKED(); \
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+ rt_trace_on = 0;
+}
+
+static void printk_task(task_t *p)
+{
+ if (p)
+ printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_task_short(task_t *p)
+{
+ if (p)
+ printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+ if (lock->name)
+ printk(" [%p] {%s}\n",
+ lock, lock->name);
+ else
+ printk(" [%p] {%s:%d}\n",
+ lock, lock->file, lock->line);
+
+ if (print_owner && rt_mutex_owner(lock)) {
+ printk(".. ->owner: %p\n", lock->owner);
+ printk(".. held by: ");
+ printk_task(rt_mutex_owner(lock));
+ printk("\n");
+ }
+ if (rt_mutex_owner(lock)) {
+ printk("... acquired at: ");
+ print_symbol("%s\n", lock->acquire_ip);
+ }
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+ printk("-------------------------\n");
+ printk("| waiter struct %p:\n", w);
+ printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+ w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
+ w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
+ w->list_entry.prio);
+ printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+ w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
+ w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
+ w->pi_list_entry.prio);
+ printk("\n| lock:\n");
+ printk_lock(w->lock, 1);
+ printk("| w->ti->task:\n");
+ printk_task(w->task);
+ printk("| blocked at: ");
+ print_symbol("%s\n", w->ip);
+ printk("-------------------------\n");
+}
+
+static void show_task_locks(task_t *p)
+{
+ switch (p->state) {
+ case TASK_RUNNING: printk("R"); break;
+ case TASK_INTERRUPTIBLE: printk("S"); break;
+ case TASK_UNINTERRUPTIBLE: printk("D"); break;
+ case TASK_STOPPED: printk("T"); break;
+ case EXIT_ZOMBIE: printk("Z"); break;
+ case EXIT_DEAD: printk("X"); break;
+ default: printk("?"); break;
+ }
+ printk_task(p);
+ if (p->pi_blocked_on) {
+ struct rt_mutex *lock = p->pi_blocked_on->lock;
+
+ printk(" blocked on:");
+ printk_lock(lock, 1);
+ } else
+ printk(" (not blocked)\n");
+}
+
+void rt_mutex_show_held_locks(task_t *task, int verbose)
+{
+ struct list_head *curr, *cursor = NULL;
+ struct rt_mutex *lock;
+ task_t *t;
+ unsigned long flags;
+ int count = 0;
+
+ if (!rt_trace_on)
+ return;
+
+ if (verbose) {
+ printk("------------------------------\n");
+ printk("| showing all locks held by: | (");
+ printk_task_short(task);
+ printk("):\n");
+ printk("------------------------------\n");
+ }
+
+next:
+ spin_lock_irqsave(&task->held_list_lock, flags);
+ list_for_each(curr, &task->held_list_head) {
+ if (cursor && curr != cursor)
+ continue;
+ lock = list_entry(curr, struct rt_mutex, held_list_entry);
+ t = rt_mutex_owner(lock);
+ WARN_ON(t != task);
+ count++;
+ cursor = curr->next;
+ spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+ printk("\n#%03d: ", count);
+ printk_lock(lock, 0);
+ goto next;
+ }
+ spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+ printk("\n");
+}
+
+void rt_mutex_show_all_locks(void)
+{
+ task_t *g, *p;
+ int count = 10;
+ int unlock = 1;
+
+ printk("\n");
+ printk("----------------------\n");
+ printk("| showing all tasks: |\n");
+ printk("----------------------\n");
+
+ /*
+	 * Here we try to get the tasklist_lock as hard as possible;
+	 * if we are not successful after 2 seconds we ignore it (but keep
+ * trying). This is to enable a debug printout even if a
+ * tasklist_lock-holding task deadlocks or crashes.
+ */
+retry:
+ if (!read_trylock(&tasklist_lock)) {
+ if (count == 10)
+ printk("hm, tasklist_lock locked, retrying... ");
+ if (count) {
+ count--;
+ printk(" #%d", 10-count);
+ mdelay(200);
+ goto retry;
+ }
+ printk(" ignoring it.\n");
+ unlock = 0;
+ }
+ if (count != 10)
+ printk(" locked it.\n");
+
+ do_each_thread(g, p) {
+ show_task_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+ printk("\n");
+
+ printk("-----------------------------------------\n");
+ printk("| showing all locks held in the system: |\n");
+ printk("-----------------------------------------\n");
+
+ do_each_thread(g, p) {
+ rt_mutex_show_held_locks(p, 0);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+
+ printk("=============================================\n\n");
+
+ if (unlock)
+ read_unlock(&tasklist_lock);
+}
+
+void rt_mutex_debug_check_no_locks_held(task_t *task)
+{
+ struct rt_mutex_waiter *w;
+ struct list_head *curr;
+ struct rt_mutex *lock;
+
+ if (!rt_trace_on)
+ return;
+ if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+ printk("BUG: PI priority boost leaked!\n");
+ printk_task(task);
+ printk("\n");
+ }
+ if (list_empty(&task->held_list_head))
+ return;
+
+ spin_lock(&task->pi_lock);
+ plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
+ TRACE_OFF();
+
+ printk("hm, PI interest held at exit time? Task:\n");
+ printk_task(task);
+ printk_waiter(w);
+ return;
+ }
+ spin_unlock(&task->pi_lock);
+
+ list_for_each(curr, &task->held_list_head) {
+ lock = list_entry(curr, struct rt_mutex, held_list_entry);
+
+ printk("BUG: %s/%d, lock held at task exit time!\n",
+ task->comm, task->pid);
+ printk_lock(lock, 1);
+ if (rt_mutex_owner(lock) != task)
+ printk("exiting task is not even the owner??\n");
+ }
+}
+
+int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+ const void *to = from + len;
+ struct list_head *curr;
+ struct rt_mutex *lock;
+ unsigned long flags;
+ void *lock_addr;
+
+ if (!rt_trace_on)
+ return 0;
+
+ spin_lock_irqsave(&current->held_list_lock, flags);
+ list_for_each(curr, &current->held_list_head) {
+ lock = list_entry(curr, struct rt_mutex, held_list_entry);
+ lock_addr = lock;
+ if (lock_addr < from || lock_addr >= to)
+ continue;
+ TRACE_OFF();
+
+ printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+ current->comm, current->pid, lock, from, to);
+ dump_stack();
+ printk_lock(lock, 1);
+ if (rt_mutex_owner(lock) != current)
+ printk("freeing task is not even the owner??\n");
+ return 1;
+ }
+ spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+ return 0;
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ WARN_ON(!plist_head_empty(&task->pi_waiters));
+ WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+ struct rt_mutex *lock)
+{
+ struct task_struct *task;
+
+ if (!rt_trace_on || detect || !act_waiter)
+ return;
+
+ task = rt_mutex_owner(act_waiter->lock);
+ if (task && task != current) {
+ act_waiter->deadlock_task_pid = task->pid;
+ act_waiter->deadlock_lock = lock;
+ }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+ struct task_struct *task;
+
+ if (!waiter->deadlock_lock || !rt_trace_on)
+ return;
+
+ task = find_task_by_pid(waiter->deadlock_task_pid);
+ if (!task)
+ return;
+
+ TRACE_OFF_NOLOCK();
+
+ printk("\n============================================\n");
+ printk( "[ BUG: circular locking deadlock detected! ]\n");
+ printk( "--------------------------------------------\n");
+ printk("%s/%d is deadlocking current task %s/%d\n\n",
+ task->comm, task->pid, current->comm, current->pid);
+
+ printk("\n1) %s/%d is trying to acquire this lock:\n",
+ current->comm, current->pid);
+ printk_lock(waiter->lock, 1);
+
+ printk("... trying at: ");
+ print_symbol("%s\n", waiter->ip);
+
+ printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+ printk_lock(waiter->deadlock_lock, 1);
+
+ rt_mutex_show_held_locks(current, 1);
+ rt_mutex_show_held_locks(task, 1);
+
+ printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+ show_stack(task, NULL);
+ printk("\n%s/%d's [current] stackdump:\n\n",
+ current->comm, current->pid);
+ dump_stack();
+ rt_mutex_show_all_locks();
+ printk("[ turning off deadlock detection."
+ "Please report this trace. ]\n\n");
+ local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+{
+ unsigned long flags;
+
+ if (rt_trace_on) {
+ TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+ spin_lock_irqsave(&current->held_list_lock, flags);
+ list_add_tail(&lock->held_list_entry, &current->held_list_head);
+ spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+ lock->acquire_ip = ip;
+ }
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+
+ if (rt_trace_on) {
+ TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+ TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+ spin_lock_irqsave(&current->held_list_lock, flags);
+ list_del_init(&lock->held_list_entry);
+ spin_unlock_irqrestore(&current->held_list_lock, flags);
+ }
+}
+
+void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner __IP_DECL__)
+{
+ unsigned long flags;
+
+ if (rt_trace_on) {
+ TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+ spin_lock_irqsave(&powner->held_list_lock, flags);
+ list_add_tail(&lock->held_list_entry, &powner->held_list_head);
+ spin_unlock_irqrestore(&powner->held_list_lock, flags);
+
+ lock->acquire_ip = ip;
+ }
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ unsigned long flags;
+
+ if (rt_trace_on) {
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ TRACE_WARN_ON_LOCKED(!owner);
+ TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+ spin_lock_irqsave(&owner->held_list_lock, flags);
+ list_del_init(&lock->held_list_entry);
+ spin_unlock_irqrestore(&owner->held_list_lock, flags);
+ }
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ memset(waiter, 0x11, sizeof(*waiter));
+ plist_node_init(&waiter->list_entry, MAX_PRIO);
+ plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+ TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+ TRACE_WARN_ON(waiter->task);
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ void *addr = lock;
+
+ if (rt_trace_on) {
+ rt_mutex_debug_check_no_locks_freed(addr,
+ sizeof(struct rt_mutex));
+ INIT_LIST_HEAD(&lock->held_list_entry);
+ lock->name = name;
+ }
+}
+
+void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 00000000000..7612fbc62d7
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+#define __IP_DECL__ , unsigned long ip
+#define __IP__ , ip
+#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner __IP_DECL__);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w) \
+ do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ int detect)
+{
+ return (waiter != NULL);
+}
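To make the macro trio above concrete, here is how one prototype from rtmutex.c expands in the two configurations (illustration only; the example function is the real try_to_take_rt_mutex() from this patch):

/*
 * With this debug header,
 *
 *	static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__);
 * becomes
 *	static int try_to_take_rt_mutex(struct rt_mutex *lock, unsigned long ip);
 *
 * and a top-level caller appends __RET_IP__, i.e.
 * ", (unsigned long)__builtin_return_address(0)", so the acquire site can be
 * recorded in lock->acquire_ip. The non-debug kernel/rtmutex.h defines all
 * three macros as empty, so the extra argument disappears entirely.
 */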
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 00000000000..e82c2f84824
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS 8
+#define MAX_RT_TEST_MUTEXES 8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+ int opcode;
+ int opdata;
+ int mutexes[MAX_RT_TEST_MUTEXES];
+ int bkl;
+ int event;
+ struct sys_device sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static task_t *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+ RTTEST_NOP = 0,
+ RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
+ RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
+ RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
+ RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
+ RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
+ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
+ RTTEST_LOCKBKL, /* 9 Lock BKL */
+ RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
+ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
+ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
+ RTTEST_RESET = 99, /* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+ int i, id, ret = -EINVAL;
+
+ switch(td->opcode) {
+
+ case RTTEST_NOP:
+ return 0;
+
+ case RTTEST_LOCKCONT:
+ td->mutexes[td->opdata] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return 0;
+
+ case RTTEST_RESET:
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+ if (td->mutexes[i] == 4) {
+ rt_mutex_unlock(&mutexes[i]);
+ td->mutexes[i] = 0;
+ }
+ }
+
+ if (!lockwakeup && td->bkl == 4) {
+ unlock_kernel();
+ td->bkl = 0;
+ }
+ return 0;
+
+ case RTTEST_RESETEVENT:
+ atomic_set(&rttest_event, 0);
+ return 0;
+
+ default:
+ if (lockwakeup)
+ return ret;
+ }
+
+ switch(td->opcode) {
+
+ case RTTEST_LOCK:
+ case RTTEST_LOCKNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_lock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 4;
+ return 0;
+
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKINTNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = ret ? 0 : 4;
+ return ret ? -EINTR : 0;
+
+ case RTTEST_UNLOCK:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+ return ret;
+
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_unlock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 0;
+ return 0;
+
+ case RTTEST_LOCKBKL:
+ if (td->bkl)
+ return 0;
+ td->bkl = 1;
+ lock_kernel();
+ td->bkl = 4;
+ return 0;
+
+ case RTTEST_UNLOCKBKL:
+ if (td->bkl != 4)
+ break;
+ unlock_kernel();
+ td->bkl = 0;
+ return 0;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have fine-grained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+ int tid, op, dat;
+ struct test_thread_data *td;
+
+ /* We have to lookup the task */
+ for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+ if (threads[tid] == current)
+ break;
+ }
+
+ BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+ td = &thread_data[tid];
+
+ op = td->opcode;
+ dat = td->opdata;
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ break;
+
+ if (td->mutexes[dat] != 1)
+ break;
+
+ td->mutexes[dat] = 2;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKBKL:
+ default:
+ break;
+ }
+
+ schedule();
+
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 3;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return;
+
+ case RTTEST_LOCKBKL:
+ return;
+ default:
+ return;
+ }
+
+ td->opcode = 0;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ int ret;
+
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 1);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (td->opcode == RTTEST_LOCKCONT)
+ break;
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ }
+
+ /* Restore previous command and data */
+ td->opcode = op;
+ td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+ struct test_thread_data *td = data;
+ int ret;
+
+ current->flags |= PF_MUTEX_TESTER;
+ allow_signal(SIGHUP);
+
+ for(;;) {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 0);
+ set_current_state(TASK_INTERRUPTIBLE);
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+
+ if (signal_pending(current))
+ flush_signals(current);
+
+ if(kthread_should_stop())
+ break;
+ }
+ return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev: thread reference
+ * @buf: command for actual step
+ * @count: length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+ size_t count)
+{
+ struct sched_param schedpar;
+ struct test_thread_data *td;
+ char cmdbuf[32];
+ int op, dat, tid, ret;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tid = td->sysdev.id;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(cmdbuf))
+ return -EINVAL;
+
+	/* strip off the \n: */
+ if (buf[count-1] == '\n')
+ count--;
+ if (count < 1)
+ return -EINVAL;
+
+ memcpy(cmdbuf, buf, count);
+ cmdbuf[count] = 0;
+
+ if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+ return -EINVAL;
+
+ switch (op) {
+ case RTTEST_SCHEDOT:
+ schedpar.sched_priority = 0;
+ ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+ if (ret)
+ return ret;
+ set_user_nice(current, 0);
+ break;
+
+ case RTTEST_SCHEDRT:
+ schedpar.sched_priority = dat;
+ ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+ if (ret)
+ return ret;
+ break;
+
+ case RTTEST_SIGNAL:
+ send_sig(SIGHUP, threads[tid], 0);
+ break;
+
+ default:
+ if (td->opcode > 0)
+ return -EBUSY;
+ td->opdata = dat;
+ td->opcode = op;
+ wake_up_process(threads[tid]);
+ }
+
+ return count;
+}
+
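A hedged sketch of user space driving one tester thread through this command interface. The sysfs path is an assumption derived from the "rttest" sysdev class registered below, the opcodes come from the enum earlier in this file, and a real test script would poll the status file between commands rather than writing blindly.

/* Illustration only (user space): lock and then unlock mutex 0 via thread 0. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int rttest_cmd(int tid, const char *cmd)
{
	char path[96];
	int fd, ret;

	/* assumed device path: sysdev class "rttest", device "rttest<tid>" */
	snprintf(path, sizeof(path),
		 "/sys/devices/system/rttest/rttest%d/command", tid);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, cmd, strlen(cmd)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}

int main(void)
{
	rttest_cmd(0, "3:0");	/* RTTEST_LOCK, lockindex 0 */
	rttest_cmd(0, "8:0");	/* RTTEST_UNLOCK, lockindex 0 */
	return 0;
}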
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev: thread to query
+ * @buf: char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+{
+ struct test_thread_data *td;
+ char *curr = buf;
+ task_t *tsk;
+ int i;
+
+ td = container_of(dev, struct test_thread_data, sysdev);
+ tsk = threads[td->sysdev.id];
+
+ spin_lock(&rttest_lock);
+
+ curr += sprintf(curr,
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+ td->opcode, td->event, tsk->state,
+ (MAX_RT_PRIO - 1) - tsk->prio,
+ (MAX_RT_PRIO - 1) - tsk->normal_prio,
+ tsk->pi_blocked_on, td->bkl);
+
+ for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+ curr += sprintf(curr, "%d", td->mutexes[i]);
+
+ spin_unlock(&rttest_lock);
+
+ curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+ mutexes[td->sysdev.id].owner);
+
+ return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+ set_kset_name("rttest"),
+};
+
+static int init_test_thread(int id)
+{
+ thread_data[id].sysdev.cls = &rttest_sysclass;
+ thread_data[id].sysdev.id = id;
+
+ threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+ if (IS_ERR(threads[id]))
+ return PTR_ERR(threads[id]);
+
+ return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+ int ret, i;
+
+ spin_lock_init(&rttest_lock);
+
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+ rt_mutex_init(&mutexes[i]);
+
+ ret = sysdev_class_register(&rttest_sysclass);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+ ret = init_test_thread(i);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+ if (ret)
+ break;
+ ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+ if (ret)
+ break;
+ }
+
+ printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+ return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 00000000000..45d61016da5
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ * Copyright (C) 2006 Esben Nielsen
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner bit1 bit0
+ * NULL 0 0 lock is free (fast acquire possible)
+ * NULL 0 1 invalid state
+ * NULL 1 0 Transitional State*
+ * NULL 1 1 invalid state
+ * taskpointer 0 0 lock is held (fast release possible)
+ * taskpointer 0 1 task is pending owner
+ * taskpointer 1 0 lock is held and has waiters
+ * taskpointer 1 1 task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There's a small time where the owner can be NULL and the
+ * "lock has waiters" bit is set. This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
+
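A worked reading of the state table above (the address is made up; task_struct is word aligned, so the two low bits of the pointer are free to carry the flags):

/*
 * Example: owner task_struct at 0xc1234560, lock contended while the owner
 * is still only the pending owner:
 *
 *	lock->owner = 0xc1234560 | RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING
 *		    = 0xc1234563
 *
 * rt_mutex_owner() in rtmutex_common.h masks RT_MUTEX_OWNER_MASKALL (the two
 * low bits) off again to recover the plain task pointer.
 */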
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask)
+{
+ unsigned long val = (unsigned long)owner | mask;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return task->normal_prio;
+
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+ task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+ int prio = rt_mutex_getprio(task);
+
+ if (task->prio != prio)
+ rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(task_t *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task
+ __IP_DECL__)
+{
+ struct rt_mutex *lock;
+ struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+ int detect_deadlock, ret = 0, depth = 0;
+ unsigned long flags;
+
+ detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+ deadlock_detect);
+
+ /*
+ * The (de)boosting is a step by step approach with a lot of
+	 * pitfalls. We want this to be preemptible and we want to hold a
+ * maximum of two locks per step. So we have to check
+ * carefully whether things change under us.
+ */
+ again:
+ if (++depth > max_lock_depth) {
+ static int prev_max;
+
+ /*
+ * Print this only once. If the admin changes the limit,
+ * print a new message when reaching the limit again.
+ */
+ if (prev_max != max_lock_depth) {
+ prev_max = max_lock_depth;
+ printk(KERN_WARNING "Maximum lock depth %d reached "
+ "task: %s (%d)\n", max_lock_depth,
+ top_task->comm, top_task->pid);
+ }
+ put_task_struct(task);
+
+ return deadlock_detect ? -EDEADLK : 0;
+ }
+ retry:
+ /*
+	 * Task cannot go away as we did a get_task_struct() before!
+ */
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ /*
+ * Check whether the end of the boosting chain has been
+ * reached or the state of the chain has changed while we
+ * dropped the locks.
+ */
+ if (!waiter || !waiter->task)
+ goto out_unlock_pi;
+
+ if (top_waiter && (!task_has_pi_waiters(task) ||
+ top_waiter != task_top_pi_waiter(task)))
+ goto out_unlock_pi;
+
+ /*
+ * When deadlock detection is off then we check, if further
+ * priority adjustment is necessary.
+ */
+ if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+ goto out_unlock_pi;
+
+ lock = waiter->lock;
+ if (!spin_trylock(&lock->wait_lock)) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ cpu_relax();
+ goto retry;
+ }
+
+ /* Deadlock detection */
+ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+ debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ spin_unlock(&lock->wait_lock);
+ ret = deadlock_detect ? -EDEADLK : 0;
+ goto out_unlock_pi;
+ }
+
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* Requeue the waiter */
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->list_entry.prio = task->prio;
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ /* Release the task */
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /* Grab the next task */
+ task = rt_mutex_owner(lock);
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ /* Boost the owner */
+ plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+
+ } else if (top_waiter == waiter) {
+ /* Deboost the owner */
+ plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+ waiter = rt_mutex_top_waiter(lock);
+ waiter->pi_list_entry.prio = waiter->list_entry.prio;
+ plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+ __rt_mutex_adjust_prio(task);
+ }
+
+ get_task_struct(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ top_waiter = rt_mutex_top_waiter(lock);
+ spin_unlock(&lock->wait_lock);
+
+ if (!detect_deadlock && waiter != top_waiter)
+ goto out_put_task;
+
+ goto again;
+
+ out_unlock_pi:
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+ put_task_struct(task);
+ return ret;
+}
+
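A worked example of the chain walk above (lower prio values mean higher priority): T3 (prio 120) holds L2, T2 (prio 110) holds L1 and is blocked on L2, and T1 (prio 50) now blocks on L1. task_blocks_on_rt_mutex() boosts T2 to 50 and then this function follows T2->pi_blocked_on to L2 and boosts T3 to 50 as well, taking at most two locks per step; when T1's waiter is later removed, the same walk runs again and deboosts T2 and T3 step by step.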
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+ struct task_struct *pendowner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *next;
+ unsigned long flags;
+
+ if (!rt_mutex_owner_pending(lock))
+ return 0;
+
+ if (pendowner == current)
+ return 1;
+
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+ if (current->prio >= pendowner->prio) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 0;
+ }
+
+ /*
+	 * Check if a waiter is enqueued on the pending owner's
+	 * pi_waiters list. Remove it and readjust the pending owner's
+ * priority.
+ */
+ if (likely(!rt_mutex_has_waiters(lock))) {
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+ return 1;
+ }
+
+ /* No chain handling, pending owner is not blocked on anything: */
+ next = rt_mutex_top_waiter(lock);
+ plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+ __rt_mutex_adjust_prio(pendowner);
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ /*
+ * We are going to steal the lock and a waiter was
+	 * enqueued on the pending owner's pi_waiters queue. So
+	 * we have to enqueue this waiter into the
+	 * current->pi_waiters list. This covers the case where
+	 * current is boosted because it holds another lock and
+	 * gets unboosted because the booster is interrupted; we
+	 * would otherwise delay a waiter with a priority higher
+	 * than current->normal_prio.
+ *
+ * Note: in the rare case of a SCHED_OTHER task changing
+ * its priority and thus stealing the lock, next->task
+ * might be current:
+ */
+ if (likely(next->task != current)) {
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_add(&next->pi_list_entry, &current->pi_waiters);
+ __rt_mutex_adjust_prio(current);
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+ }
+ return 1;
+}
+
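Concretely: when the lock is released, wakeup_next_waiter() below hands pending ownership to the top waiter, say a prio-120 task. If a prio-50 task calls try_to_take_rt_mutex() before the woken task gets to run, the check above sees a lower-priority pending owner and steals the lock; the woken task then finds waiter.task cleared in rt_mutex_slowlock() and simply re-queues itself as a waiter.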
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+{
+ /*
+ * We have to be careful here if the atomic speedups are
+ * enabled, such that, when
+ * - no other waiter is on the lock
+ * - the lock has been released since we did the cmpxchg
+ * the lock can be released or taken while we are doing the
+ * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ *
+ * The atomic acquire/release aware variant of
+ * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+ * the WAITERS bit, the atomic release / acquire can not
+ * happen anymore and lock->wait_lock protects us from the
+ * non-atomic case.
+ *
+ * Note, that this might set lock->owner =
+ * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+ * any more. This is fixed up when we take the ownership.
+ * This is the transitional state explained at the top of this file.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+ return 0;
+
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock __IP__);
+
+ rt_mutex_set_owner(lock, current, 0);
+
+ rt_mutex_deadlock_account_lock(lock, current);
+
+ return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ int detect_deadlock
+ __IP_DECL__)
+{
+ struct rt_mutex_waiter *top_waiter = waiter;
+ task_t *owner = rt_mutex_owner(lock);
+ int boost = 0, res;
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ __rt_mutex_adjust_prio(current);
+ waiter->task = current;
+ waiter->lock = lock;
+ plist_node_init(&waiter->list_entry, current->prio);
+ plist_node_init(&waiter->pi_list_entry, current->prio);
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ plist_add(&waiter->list_entry, &lock->wait_list);
+
+ current->pi_blocked_on = waiter;
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ spin_lock_irqsave(&owner->pi_lock, flags);
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ boost = 1;
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+ }
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+ else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+ spin_lock_irqsave(&owner->pi_lock, flags);
+ if (owner->pi_blocked_on) {
+ boost = 1;
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+ }
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+ if (!boost)
+ return 0;
+
+ spin_unlock(&lock->wait_lock);
+
+ res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+ current __IP__);
+
+ spin_lock(&lock->wait_lock);
+
+ return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current task's waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+ struct task_struct *pendowner;
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+
+ waiter = rt_mutex_top_waiter(lock);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+
+ /*
+ * Remove it from current->pi_waiters. We do not adjust a
+ * possible priority boost right now. We execute wakeup in the
+ * boosted mode and go back to normal after releasing
+ * lock->wait_lock.
+ */
+ plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+ pendowner = waiter->task;
+ waiter->task = NULL;
+
+ rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ /*
+ * Clear the pi_blocked_on variable and enqueue a possible
+	 * waiter into the pi_waiters list of the pending owner. This
+	 * prevents the situation where, if the pending owner gets
+	 * unboosted, a waiter with a priority higher than
+	 * pending-owner->normal_prio stays blocked on the unboosted
+	 * (pending) owner.
+ */
+ spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+ WARN_ON(!pendowner->pi_blocked_on);
+ WARN_ON(pendowner->pi_blocked_on != waiter);
+ WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+ pendowner->pi_blocked_on = NULL;
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+ }
+ spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+ wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter __IP_DECL__)
+{
+ int first = (waiter == rt_mutex_top_waiter(lock));
+ int boost = 0;
+ task_t *owner = rt_mutex_owner(lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->pi_lock, flags);
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ waiter->task = NULL;
+ current->pi_blocked_on = NULL;
+ spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ if (first && owner != current) {
+
+ spin_lock_irqsave(&owner->pi_lock, flags);
+
+ plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+ if (rt_mutex_has_waiters(lock)) {
+ struct rt_mutex_waiter *next;
+
+ next = rt_mutex_top_waiter(lock);
+ plist_add(&next->pi_list_entry, &owner->pi_waiters);
+ }
+ __rt_mutex_adjust_prio(owner);
+
+ if (owner->pi_blocked_on) {
+ boost = 1;
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+ }
+ spin_unlock_irqrestore(&owner->pi_lock, flags);
+ }
+
+ WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+ if (!boost)
+ return;
+
+ spin_unlock(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
+
+ spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || waiter->list_entry.prio == task->prio) {
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+ spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ waiter.task = NULL;
+
+ spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock __IP__)) {
+ spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout))
+ hrtimer_start(&timeout->timer, timeout->timer.expires,
+ HRTIMER_ABS);
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock __IP__))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ /*
+ * waiter.task is NULL the first time we come here and
+ * when we have been woken up by the previous owner
+ * but the lock got stolen by a higher prio task.
+ */
+ if (!waiter.task) {
+ ret = task_blocks_on_rt_mutex(lock, &waiter,
+ detect_deadlock __IP__);
+ /*
+ * If we got woken up by the owner then start loop
+ * all over without going into schedule to try
+ * to get the lock now:
+ */
+ if (unlikely(!waiter.task))
+ continue;
+
+ if (unlikely(ret))
+ break;
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(&waiter);
+
+ if (waiter.task)
+ schedule_rt_mutex(lock);
+
+ spin_lock(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (unlikely(waiter.task))
+ remove_waiter(lock, &waiter __IP__);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout))
+ hrtimer_cancel(&timeout->timer);
+
+ /*
+ * Readjust priority, when we did not get the lock. We might
+ * have been the pending owner and boosted. Since we did not
+ * take the lock, the PI boost has to go.
+ */
+ if (unlikely(ret))
+ rt_mutex_adjust_prio(current);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+{
+ int ret = 0;
+
+ spin_lock(&lock->wait_lock);
+
+ if (likely(rt_mutex_owner(lock) != current)) {
+
+ ret = try_to_take_rt_mutex(lock __IP__);
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters
+ * bit unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+ }
+
+ spin_unlock(&lock->wait_lock);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ spin_lock(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ spin_unlock(&lock->wait_lock);
+ return;
+ }
+
+ wakeup_next_waiter(lock);
+
+ spin_unlock(&lock->wait_lock);
+
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout, int detect_deadlock,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ int detect_deadlock __IP_DECL__))
+{
+ if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 1;
+ }
+ return slowfn(lock __RET_IP__);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible; the timeout
+ *			 structure is provided by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ * @detect_deadlock: deadlock detection on/off
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEDOUT when the timeout expired
+ * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+ int detect_deadlock)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
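A hedged sketch of a caller setting up the timeout for the function above; grab_with_timeout() and the 100 ms budget are placeholders, and computing the absolute expiry via the timer base's get_time() is an assumption patterned on other hrtimer_sleeper users rather than anything mandated by this API (rt_mutex_slowlock() starts the timer with HRTIMER_ABS, so the expiry must be absolute).

/* Illustration only: try to take a lock, giving up after roughly 100 ms. */
static int grab_with_timeout(struct rt_mutex *lock)
{
	struct hrtimer_sleeper timeout;

	hrtimer_init(&timeout.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
	hrtimer_init_sleeper(&timeout, current);

	/* absolute expiry: "now" on this clock plus 100 ms */
	timeout.timer.expires = ktime_add(timeout.timer.base->get_time(),
					  ktime_set(0, 100 * 1000 * 1000));

	/* 0 on success, -EINTR or -ETIMEDOUT otherwise */
	return rt_mutex_timed_lock(lock, &timeout, 0);
}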
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
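A minimal usage sketch of the core API above; my_lock, the helper and the critical sections are placeholders, and rt_mutex_init() is the initializer already used by the tester in this series.

static struct rt_mutex my_lock;			/* placeholder lock */

static void rt_mutex_usage_example(void)
{
	rt_mutex_init(&my_lock);		/* normally done once, at init time */

	rt_mutex_lock(&my_lock);		/* may sleep; boosts the owner while we wait */
	/* ... critical section ... */
	rt_mutex_unlock(&my_lock);

	if (rt_mutex_trylock(&my_lock)) {	/* 1 on success, 0 on contention */
		/* ... critical section ... */
		rt_mutex_unlock(&my_lock);
	}
}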
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ * @name: the name of the lock, used only by the debug code
+ *
+ * Initialize the rt lock to the unlocked state.
+ *
+ * Initializing a locked rt lock is not allowed.
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ lock->owner = NULL;
+ spin_lock_init(&lock->wait_lock);
+ plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+ debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner: the task to set as owner
+ *
+ * No locking. The caller has to serialize calls itself.
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
+ rt_mutex_set_owner(lock, proxy_owner, 0);
+ rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. The caller has to serialize calls itself.
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL, 0);
+ rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock to query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ return NULL;
+
+ return rt_mutex_top_waiter(lock)->task;
+}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 00000000000..1e0fca13ff7
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+#define rt_mutex_deadlock_check(l) (0)
+#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d) (d)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 00000000000..9c75856e791
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The rtmutex in-kernel tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock) \
+ do { \
+ if (!(current->flags & PF_MUTEX_TESTER)) \
+ schedule(); \
+ else \
+ schedule_rt_mutex_test(_lock); \
+ } while (0)
+
+#else
+# define schedule_rt_mutex(_lock) schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on an rt_mutex,
+ * which is allocated on the kernel stack of the blocked task.
+ *
+ * @list_entry: pi node to enqueue into the mutex waiters list
+ * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct plist_node list_entry;
+ struct plist_node pi_list_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ pid_t deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *w;
+
+ w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+ list_entry);
+ BUG_ON(w->lock != lock);
+
+ return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+ pi_list_entry);
+}
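rt_mutex_top_waiter() and task_top_pi_waiter() rely on the plists being kept
sorted by priority, so the head entry is always the highest-priority waiter
(numerically smallest prio). A stand-alone model of that selection using a
plain array in place of the plist (the prio values are illustrative; lower
means higher priority):

/* User-space model of picking the top waiter: smallest prio value wins. */
#include <stddef.h>
#include <stdio.h>

struct waiter { int prio; const char *comm; };

static const struct waiter *top_waiter(const struct waiter *w, size_t n)
{
	const struct waiter *best = NULL;
	size_t i;

	for (i = 0; i < n; i++)
		if (!best || w[i].prio < best->prio)
			best = &w[i];
	return best;			/* NULL when there are no waiters */
}

int main(void)
{
	struct waiter waiters[] = {
		{ 120, "nice-0 task"  },
		{  89, "rt waiter"    },	/* highest priority */
		{ 139, "nice-19 task" },
	};
	const struct waiter *top = top_waiter(waiters, 3);

	printf("top waiter: %s (prio %d)\n", top->comm, top->prio);
	return 0;
}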
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING 1UL
+#define RT_MUTEX_HAS_WAITERS 2UL
+#define RT_MUTEX_OWNER_MASKALL 3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+ return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
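These accessors work because task_struct pointers are at least 4-byte
aligned, which leaves the two low bits of lock->owner free to carry the
OWNER_PENDING and HAS_WAITERS flags. A stand-alone model of the same
pointer/flag packing (a dummy struct stands in for task_struct; the
alignment is an explicit assumption):

/* Pack two state bits into the low bits of an aligned pointer. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define OWNER_PENDING	1UL
#define HAS_WAITERS	2UL
#define MASKALL		3UL

struct task { long dummy; } __attribute__((aligned(4)));

static struct task *owner(void *raw)		/* strip both flag bits */
{
	return (struct task *)((uintptr_t)raw & ~MASKALL);
}

int main(void)
{
	static struct task t;
	/* encode: owner is &t, the lock has waiters, ownership is pending */
	void *raw = (void *)((uintptr_t)&t | HAS_WAITERS | OWNER_PENDING);

	assert(owner(raw) == &t);
	printf("pending=%d has_waiters=%d\n",
	       (int)((uintptr_t)raw & OWNER_PENDING),
	       (int)(((uintptr_t)raw & HAS_WAITERS) != 0));
	return 0;
}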
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index a856040c200..2629c1711fd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
*/
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
{
- if (p->static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+ if (static_prio < NICE_TO_PRIO(0))
+ return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
else
- return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+ return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+ return static_prio_timeslice(p->static_prio);
+}
+
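static_prio_timeslice() reproduces the old timeslice curve from the static
priority alone, which is what allows the load-weight code further down to
reuse it. A stand-alone recomputation in milliseconds, assuming the constants
of this kernel series (MAX_PRIO 140, MAX_USER_PRIO 40, NICE_TO_PRIO(n) =
120 + n, DEF_TIMESLICE 100 ms, MIN_TIMESLICE 5 ms); with those values nice
-20, 0 and +19 come out to roughly 800 ms, 100 ms and 5 ms:

/* Recompute the static_prio_timeslice() curve in user space; the constants
 * are assumptions matching this kernel series, expressed in milliseconds. */
#include <stdio.h>

#define MAX_PRIO	140
#define MAX_USER_PRIO	40
#define NICE_TO_PRIO(n)	(120 + (n))
#define DEF_TIMESLICE	100	/* ms */
#define MIN_TIMESLICE	5	/* ms */

#define max(a, b)	((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
	max((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice += 13)	/* -20, -7, 6, 19 */
		printf("nice %3d -> %3u ms\n", nice,
		       static_prio_timeslice(NICE_TO_PRIO(nice)));
	return 0;
}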
#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
< (long long) (sd)->cache_hot_time)
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
* These are the runqueue data structures:
*/
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
typedef struct runqueue runqueue_t;
struct prio_array {
unsigned int nr_active;
- unsigned long bitmap[BITMAP_SIZE];
+ DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_PRIO];
};
@@ -209,6 +213,7 @@ struct runqueue {
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
+ unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
#endif
@@ -239,7 +244,6 @@ struct runqueue {
task_t *migration_thread;
struct list_head migration_queue;
- int cpu;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called with interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+ __acquires(rq->lock)
+{
+ struct runqueue *rq;
+
+repeat_lock_task:
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (unlikely(rq != task_rq(p))) {
+ spin_unlock(&rq->lock);
+ goto repeat_lock_task;
+ }
+ return rq;
+}
+
+/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
__acquires(rq->lock)
{
struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
return rq;
}
+static inline void __task_rq_unlock(runqueue_t *rq)
+ __releases(rq->lock)
+{
+ spin_unlock(&rq->lock);
+}
+
static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
__releases(rq->lock)
{
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
}
/*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
*
* Both properties are important to certain workloads.
*/
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
{
int bonus, prio;
- if (rt_task(p))
- return p->prio;
-
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p)
}
/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs, the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+ (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+ LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+ (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(task_t *p)
+{
+ if (has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+ if (p == task_rq(p)->migration_thread)
+ /*
+ * The migration thread does the actual balancing.
+ * Giving its load any weight will skew balancing
+ * adversely.
+ */
+ p->load_weight = 0;
+ else
+#endif
+ p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+ } else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
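With set_load_weight() in place every runnable task contributes load_weight
to rq->raw_weighted_load instead of a flat SCHED_LOAD_SCALE; for SCHED_NORMAL
tasks the weight is just the scaled timeslice, while RT tasks sit above
PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO). A stand-alone recomputation for a few nice
levels, assuming SCHED_LOAD_SCALE is 128 and the same timeslice constants as
in the sketch above, which yields roughly 1024, 128 and 6:

/* Recompute LOAD_WEIGHT()/PRIO_TO_LOAD_WEIGHT(); SCHED_LOAD_SCALE = 128 and
 * the timeslice constants are assumptions matching this kernel series. */
#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL
#define DEF_TIMESLICE		100	/* ms, == TIME_SLICE_NICE_ZERO */
#define MIN_TIMESLICE		5
#define MAX_PRIO		140
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(n)		(120 + (n))

#define max(a, b)		((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
	max((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned long static_prio_timeslice(int prio)
{
	return prio < NICE_TO_PRIO(0) ? SCALE_PRIO(DEF_TIMESLICE * 4, prio)
				      : SCALE_PRIO(DEF_TIMESLICE, prio);
}

#define LOAD_WEIGHT(lp)		((lp) * SCHED_LOAD_SCALE / DEF_TIMESLICE)
#define PRIO_TO_LOAD_WEIGHT(p)	LOAD_WEIGHT(static_prio_timeslice(p))

int main(void)
{
	printf("nice -20: %lu\n", PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(-20)));
	printf("nice   0: %lu\n", PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)));
	printf("nice  19: %lu\n", PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(19)));
	return 0;
}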
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+ rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+ rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+ rq->nr_running++;
+ inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+ rq->nr_running--;
+ dec_raw_weighted_load(rq, p);
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+ int prio;
+
+ if (has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
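The reason for splitting static_prio, normal_prio and prio is that priority
inheritance only ever touches prio: static_prio carries the nice value,
normal_prio the policy-derived value that PI never modifies, and prio the
effective value which rt_mutex_setprio() may raise temporarily. A stand-alone
sketch of that layering (the interactivity bonus of __normal_prio() is
ignored and the prio numbers are illustrative):

/* Model of the static_prio / normal_prio / prio layering used for PI. */
#include <stdio.h>

#define MAX_RT_PRIO	100	/* prios 0..99 are realtime */

struct task {
	int static_prio;	/* nice-derived, e.g. 120 for nice 0 */
	int rt_priority;	/* 1..99 for RT policies, else 0     */
	int has_rt_policy;
	int normal_prio;	/* policy view, never PI-boosted     */
	int prio;		/* effective, may be PI-boosted      */
};

static int normal_prio(const struct task *p)
{
	return p->has_rt_policy ? MAX_RT_PRIO - 1 - p->rt_priority
				: p->static_prio;	/* bonus ignored */
}

int main(void)
{
	struct task p = { .static_prio = 120 };		/* nice 0 */

	p.normal_prio = p.prio = normal_prio(&p);
	printf("before boost:  prio=%d normal=%d\n", p.prio, p.normal_prio);

	p.prio = 89;		/* PI boost to an RT waiter's priority */
	printf("while boosted: prio=%d normal=%d\n", p.prio, p.normal_prio);

	p.prio = p.normal_prio;	/* boost dropped: back to policy value */
	printf("after unboost: prio=%d normal=%d\n", p.prio, p.normal_prio);
	return 0;
}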
+
+/*
* __activate_task - move a task to the runqueue.
*/
static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
if (batch_task(p))
target = rq->expired;
enqueue_task(p, target);
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
/*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
{
enqueue_task_head(p, rq->active);
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
- unsigned long long __sleep_time = now - p->timestamp;
- unsigned long sleep_time;
+ unsigned long sleep_time = now - p->timestamp;
if (batch_task(p))
sleep_time = 0;
- else {
- if (__sleep_time > NS_MAX_SLEEP_AVG)
- sleep_time = NS_MAX_SLEEP_AVG;
- else
- sleep_time = (unsigned long)__sleep_time;
- }
if (likely(sleep_time > 0)) {
/*
- * User tasks that sleep a long time are categorised as
- * idle. They will only have their sleep_avg increased to a
- * level that makes them just interactive priority to stay
- * active yet prevent them suddenly becoming cpu hogs and
- * starving other processes.
+ * This ceiling is set to the lowest priority that would allow
+ * a task to be reinserted into the active array on timeslice
+ * completion.
*/
- if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
- unsigned long ceiling;
+ unsigned long ceiling = INTERACTIVE_SLEEP(p);
- ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
- DEF_TIMESLICE);
- if (p->sleep_avg < ceiling)
- p->sleep_avg = ceiling;
+ if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+ /*
+ * Prevents user tasks from achieving best priority
+ * with one single large enough sleep.
+ */
+ p->sleep_avg = ceiling;
+ /*
+ * Using INTERACTIVE_SLEEP() as a ceiling places a
+ * nice(0) task 1ms sleep away from promotion, and
+ * gives it 700ms to round-robin with no chance of
+ * being demoted. This is more than generous, so
+ * mark this sleep as non-interactive to prevent the
+ * on-runqueue bonus logic from intervening should
+ * this task not receive cpu immediately.
+ */
+ p->sleep_type = SLEEP_NONINTERACTIVE;
} else {
/*
* Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
* are likely to be waiting on I/O
*/
if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
- if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+ if (p->sleep_avg >= ceiling)
sleep_time = 0;
else if (p->sleep_avg + sleep_time >=
- INTERACTIVE_SLEEP(p)) {
- p->sleep_avg = INTERACTIVE_SLEEP(p);
- sleep_time = 0;
+ ceiling) {
+ p->sleep_avg = ceiling;
+ sleep_time = 0;
}
}
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
*/
p->sleep_avg += sleep_time;
- if (p->sleep_avg > NS_MAX_SLEEP_AVG)
- p->sleep_avg = NS_MAX_SLEEP_AVG;
}
+ if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+ p->sleep_avg = NS_MAX_SLEEP_AVG;
}
return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
*/
static void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
- rq->nr_running--;
+ dec_nr_running(p, rq);
dequeue_task(p, p->array);
p->array = NULL;
}
@@ -860,6 +993,12 @@ inline int task_curr(const task_t *p)
return cpu_curr(task_cpu(p)) == p;
}
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->raw_weighted_load;
+}
+
#ifdef CONFIG_SMP
typedef struct {
struct list_head list;
@@ -949,7 +1088,8 @@ void kick_process(task_t *p)
}
/*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
@@ -957,24 +1097,36 @@ void kick_process(task_t *p)
static inline unsigned long source_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;
- return min(rq->cpu_load[type-1], load_now);
+ return min(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
*/
static inline unsigned long target_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
- unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
if (type == 0)
- return load_now;
+ return rq->raw_weighted_load;
+
+ return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+ runqueue_t *rq = cpu_rq(cpu);
+ unsigned long n = rq->nr_running;
- return max(rq->cpu_load[type-1], load_now);
+ return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
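source_load() and target_load() keep their roles as a low and a high guess,
only measured against raw_weighted_load instead of nr_running *
SCHED_LOAD_SCALE, and cpu_avg_load_per_task() is simply that weighted load
divided by nr_running. A stand-alone illustration with made-up numbers
(three nice-0 tasks at an assumed weight of 128 each):

/* Low/high load guesses against a decayed history, as in source_load() and
 * target_load(); the numbers are invented for illustration. */
#include <stdio.h>

#define min(a, b)	((a) < (b) ? (a) : (b))
#define max(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned long raw_weighted_load = 384;	/* three nice-0 tasks */
	unsigned long cpu_load_hist = 256;	/* decayed cpu_load[] entry */
	unsigned long nr_running = 3;

	printf("source_load (low guess):  %lu\n",
	       min(cpu_load_hist, raw_weighted_load));
	printf("target_load (high guess): %lu\n",
	       max(cpu_load_hist, raw_weighted_load));
	printf("avg load per task:        %lu\n",
	       raw_weighted_load / nr_running);
	return 0;
}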
/*
@@ -1047,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
cpus_and(tmp, group->cpumask, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
- load = source_load(i, 0);
+ load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -1074,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;
- for_each_domain(cpu, tmp)
+ for_each_domain(cpu, tmp) {
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
+ if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+ break;
if (tmp->flags & flag)
sd = tmp;
+ }
while (sd) {
cpumask_t span;
@@ -1226,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
+ unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync)
- tl -= SCHED_LOAD_SCALE;
+ tl -= current->load_weight;
if ((tl <= load &&
- tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
- 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ tl + target_load(cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->load_weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1353,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* event cannot wake it up and insert it on the runqueue either.
*/
p->state = TASK_RUNNING;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child:
+ */
+ p->prio = current->normal_prio;
+
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
#ifdef CONFIG_SCHEDSTATS
@@ -1432,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
__activate_task(p, rq);
else {
p->prio = current->prio;
+ p->normal_prio = current->normal_prio;
list_add_tail(&p->run_list, &current->run_list);
p->array = current->array;
p->array->nr_active++;
- rq->nr_running++;
+ inc_nr_running(p, rq);
}
set_need_resched();
} else
@@ -1653,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
unsigned long long nr_context_switches(void)
{
- unsigned long long i, sum = 0;
+ int i;
+ unsigned long long sum = 0;
for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_switches;
@@ -1691,9 +1859,6 @@ unsigned long nr_active(void)
/*
* double_rq_lock - safely lock two runqueues
*
- * We must take them in cpu order to match code in
- * dependent_sleeper and wake_dependent_sleeper.
- *
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
@@ -1705,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1->cpu < rq2->cpu) {
+ if (rq1 < rq2) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
@@ -1741,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
__acquires(this_rq->lock)
{
if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest->cpu < this_rq->cpu) {
+ if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock(&this_rq->lock);
@@ -1804,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
{
dequeue_task(p, src_array);
- src_rq->nr_running--;
+ dec_nr_running(p, src_rq);
set_task_cpu(p, this_cpu);
- this_rq->nr_running++;
+ inc_nr_running(p, this_rq);
enqueue_task(p, this_array);
p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+ this_rq->timestamp_last_tick;
@@ -1853,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
return 1;
}
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
/*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
*
* Called with both runqueues locked.
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
- unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle, int *all_pinned)
+ unsigned long max_nr_move, unsigned long max_load_move,
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0, pinned = 0;
+ int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
+ int busiest_best_prio_seen;
+ int skip_for_load; /* skip the task based on weighted load issues */
+ long rem_load_move;
task_t *tmp;
- if (max_nr_move == 0)
+ if (max_nr_move == 0 || max_load_move == 0)
goto out;
+ rem_load_move = max_load_move;
pinned = 1;
+ this_best_prio = rq_best_prio(this_rq);
+ busiest_best_prio = rq_best_prio(busiest);
+ /*
+ * Enable handling of the case where there is more than one task
+ * with the best priority. If the current running task is one
+ * of those with prio==busiest_best_prio we know it won't be moved
+ * and therefore it's safe to override the skip (based on load) of
+ * any task we find with that prio.
+ */
+ busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
/*
* We first consider expired tasks. Those will likely not be
@@ -1912,7 +2093,17 @@ skip_queue:
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+ /*
+ * To help distribute high priority tasks across CPUs, we don't
+ * skip a task if it will be the highest priority task (i.e. smallest
+ * prio value) on its new queue, regardless of its load weight.
+ */
+ skip_for_load = tmp->load_weight > rem_load_move;
+ if (skip_for_load && idx < this_best_prio)
+ skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
+ if (skip_for_load ||
+ !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+ busiest_best_prio_seen |= idx == busiest_best_prio;
if (curr != head)
goto skip_queue;
idx++;
@@ -1926,9 +2117,15 @@ skip_queue:
pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
pulled++;
+ rem_load_move -= tmp->load_weight;
- /* We only want to steal up to the prescribed number of tasks. */
- if (pulled < max_nr_move) {
+ /*
+ * We only want to steal up to the prescribed number of tasks
+ * and the prescribed amount of weighted load.
+ */
+ if (pulled < max_nr_move && rem_load_move > 0) {
+ if (idx < this_best_prio)
+ this_best_prio = idx;
if (curr != head)
goto skip_queue;
idx++;
@@ -1949,7 +2146,7 @@ out:
/*
* find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
* moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
@@ -1959,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
+ unsigned long busiest_load_per_task, busiest_nr_running;
+ unsigned long this_load_per_task, this_nr_running;
int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance = 1;
+ unsigned long leader_nr_running = 0, min_load_per_task = 0;
+ unsigned long min_nr_running = ULONG_MAX;
+ struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
max_load = this_load = total_load = total_pwr = 0;
+ busiest_load_per_task = busiest_nr_running = 0;
+ this_load_per_task = this_nr_running = 0;
if (idle == NOT_IDLE)
load_idx = sd->busy_idx;
else if (idle == NEWLY_IDLE)
@@ -1970,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
load_idx = sd->idle_idx;
do {
- unsigned long load;
+ unsigned long load, group_capacity;
int local_group;
int i;
+ unsigned long sum_nr_running, sum_weighted_load;
local_group = cpu_isset(this_cpu, group->cpumask);
/* Tally up the load of all CPUs in the group */
- avg_load = 0;
+ sum_weighted_load = sum_nr_running = avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ runqueue_t *rq = cpu_rq(i);
+
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;
@@ -1990,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
load = source_load(i, load_idx);
avg_load += load;
+ sum_nr_running += rq->nr_running;
+ sum_weighted_load += rq->raw_weighted_load;
}
total_load += avg_load;
@@ -1998,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
/* Adjust by relative CPU power of the group */
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
if (local_group) {
this_load = avg_load;
this = group;
- } else if (avg_load > max_load) {
+ this_nr_running = sum_nr_running;
+ this_load_per_task = sum_weighted_load;
+ } else if (avg_load > max_load &&
+ sum_nr_running > group_capacity) {
max_load = avg_load;
busiest = group;
+ busiest_nr_running = sum_nr_running;
+ busiest_load_per_task = sum_weighted_load;
}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Busy processors will not participate in power savings
+ * balance.
+ */
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto group_next;
+
+ /*
+ * If the local group is idle or completely loaded,
+ * there is no need to do power savings balance at this domain
+ */
+ if (local_group && (this_nr_running >= group_capacity ||
+ !this_nr_running))
+ power_savings_balance = 0;
+
+ /*
+ * If a group is already running at full capacity or idle,
+ * don't include that group in power savings calculations
+ */
+ if (!power_savings_balance || sum_nr_running >= group_capacity
+ || !sum_nr_running)
+ goto group_next;
+
+ /*
+ * Calculate the group which has the least non-idle load.
+ * This is the group from where we need to pick up the load
+ * for saving power
+ */
+ if ((sum_nr_running < min_nr_running) ||
+ (sum_nr_running == min_nr_running &&
+ first_cpu(group->cpumask) <
+ first_cpu(group_min->cpumask))) {
+ group_min = group;
+ min_nr_running = sum_nr_running;
+ min_load_per_task = sum_weighted_load /
+ sum_nr_running;
+ }
+
+ /*
+ * Calculate the group which is nearly at its
+ * capacity but still has some space to pick up load
+ * from another group and save more power
+ */
+ if (sum_nr_running <= group_capacity - 1)
+ if (sum_nr_running > leader_nr_running ||
+ (sum_nr_running == leader_nr_running &&
+ first_cpu(group->cpumask) >
+ first_cpu(group_leader->cpumask))) {
+ group_leader = group;
+ leader_nr_running = sum_nr_running;
+ }
+
+group_next:
+#endif
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+ if (!busiest || this_load >= max_load || busiest_nr_running == 0)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2017,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced;
+ busiest_load_per_task /= busiest_nr_running;
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
@@ -2028,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+ if (max_load <= busiest_load_per_task)
+ goto out_balanced;
+
+ /*
+ * In the presence of smp nice balancing, certain scenarios can have
+ * max load less than avg load (as we skip the groups at or below
+ * their cpu_power while calculating max_load).
+ */
+ if (max_load < avg_load) {
+ *imbalance = 0;
+ goto small_imbalance;
+ }
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+ max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
- if (*imbalance < SCHED_LOAD_SCALE) {
- unsigned long pwr_now = 0, pwr_move = 0;
+ /*
+ * if *imbalance is less than the average load per runnable task,
+ * there is no guarantee that any tasks will be moved, so we
+ * consider bumping its value to force at least one task to be
+ * moved
+ */
+ if (*imbalance < busiest_load_per_task) {
+ unsigned long pwr_now, pwr_move;
unsigned long tmp;
+ unsigned int imbn;
+
+small_imbalance:
+ pwr_move = pwr_now = 0;
+ imbn = 2;
+ if (this_nr_running) {
+ this_load_per_task /= this_nr_running;
+ if (busiest_load_per_task > this_load_per_task)
+ imbn = 1;
+ } else
+ this_load_per_task = SCHED_LOAD_SCALE;
- if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
- *imbalance = 1;
+ if (max_load - this_load >= busiest_load_per_task * imbn) {
+ *imbalance = busiest_load_per_task;
return busiest;
}
@@ -2052,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* moving them.
*/
- pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
- pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+ pwr_now += busiest->cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
if (max_load > tmp)
- pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
- max_load - tmp);
+ pwr_move += busiest->cpu_power *
+ min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
if (max_load*busiest->cpu_power <
- SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+ busiest_load_per_task*SCHED_LOAD_SCALE)
tmp = max_load*busiest->cpu_power/this->cpu_power;
else
- tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
- pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+ tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+ pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move <= pwr_now)
goto out_balanced;
- *imbalance = 1;
- return busiest;
+ *imbalance = busiest_load_per_task;
}
- /* Get rid of the scaling factor, rounding down as we divide */
- *imbalance = *imbalance / SCHED_LOAD_SCALE;
return busiest;
out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ goto ret;
+ if (this == group_leader && group_leader != group_min) {
+ *imbalance = min_load_per_task;
+ return group_min;
+ }
+ret:
+#endif
*imbalance = 0;
return NULL;
}
@@ -2093,18 +2406,21 @@ out_balanced:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static runqueue_t *find_busiest_queue(struct sched_group *group,
- enum idle_type idle)
+ enum idle_type idle, unsigned long imbalance)
{
- unsigned long load, max_load = 0;
- runqueue_t *busiest = NULL;
+ unsigned long max_load = 0;
+ runqueue_t *busiest = NULL, *rqi;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i, 0);
+ rqi = cpu_rq(i);
+
+ if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+ continue;
- if (load > max_load) {
- max_load = load;
- busiest = cpu_rq(i);
+ if (rqi->raw_weighted_load > max_load) {
+ max_load = rqi->raw_weighted_load;
+ busiest = rqi;
}
}
@@ -2117,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
*/
#define MAX_PINNED_INTERVAL 512
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -2133,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
int active_balance = 0;
int sd_idle = 0;
- if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
@@ -2144,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle);
+ busiest = find_busiest_queue(group, idle, imbalance);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2164,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ minus_1_or_zero(busiest->nr_running),
imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
@@ -2221,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->balance_interval *= 2;
}
- if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
return nr_moved;
@@ -2236,7 +2556,7 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
return -1;
return 0;
}
@@ -2257,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
int nr_moved = 0;
int sd_idle = 0;
- if (sd->flags & SD_SHARE_CPUPOWER)
+ if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2267,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
goto out_balanced;
}
- busiest = find_busiest_queue(group, NEWLY_IDLE);
+ busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
goto out_balanced;
@@ -2282,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ minus_1_or_zero(busiest->nr_running),
imbalance, sd, NEWLY_IDLE, NULL);
spin_unlock(&busiest->lock);
}
@@ -2297,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
return -1;
sd->nr_balance_failed = 0;
return 0;
@@ -2352,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
- for_each_domain(target_cpu, sd)
+ for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpu_isset(busiest_cpu, sd->span))
break;
+ }
if (unlikely(sd == NULL))
goto out;
schedstat_inc(sd, alb_cnt);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+ RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -2390,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
struct sched_domain *sd;
int i;
- this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ this_load = this_rq->raw_weighted_load;
/* Update our load */
for (i = 0; i < 3; i++) {
unsigned long new_load = this_load;
@@ -2691,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
resched_task(rq->idle);
}
-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupts disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
int i;
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }
if (!sd)
return;
- /*
- * Unlock the current runqueue because we have to lock in
- * CPU order to avoid deadlocks. Caller knows that we might
- * unlock. We keep IRQs disabled.
- */
- spin_unlock(&this_rq->lock);
-
- sibling_map = sd->span;
-
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- /*
- * We clear this CPU from the mask. This both simplifies the
- * inner loop and keps this_rq locked when we exit:
- */
- cpu_clear(this_cpu, sibling_map);
-
- for_each_cpu_mask(i, sibling_map) {
+ for_each_cpu_mask(i, sd->span) {
runqueue_t *smt_rq = cpu_rq(i);
+ if (i == this_cpu)
+ continue;
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
+
wakeup_busy_runqueue(smt_rq);
+ spin_unlock(&smt_rq->lock);
}
-
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
- /*
- * We exit with this_cpu's rq still held and IRQs
- * still disabled:
- */
}
/*
@@ -2745,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
return p->time_slice * (100 - sd->per_cpu_gain) / 100;
}
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runqueue lock,
+ * we only trylock the sibling runqueues and bypass those runqueues if we
+ * fail to acquire their lock. As we only trylock, the normal locking order
+ * does not need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
{
struct sched_domain *tmp, *sd = NULL;
- cpumask_t sibling_map;
- prio_array_t *array;
int ret = 0, i;
- task_t *p;
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_SHARE_CPUPOWER)
+ /* kernel/rt threads do not participate in dependent sleeping */
+ if (!p->mm || rt_task(p))
+ return 0;
+
+ for_each_domain(this_cpu, tmp) {
+ if (tmp->flags & SD_SHARE_CPUPOWER) {
sd = tmp;
+ break;
+ }
+ }
if (!sd)
return 0;
- /*
- * The same locking rules and details apply as for
- * wake_sleeping_dependent():
- */
- spin_unlock(&this_rq->lock);
- sibling_map = sd->span;
- for_each_cpu_mask(i, sibling_map)
- spin_lock(&cpu_rq(i)->lock);
- cpu_clear(this_cpu, sibling_map);
+ for_each_cpu_mask(i, sd->span) {
+ runqueue_t *smt_rq;
+ task_t *smt_curr;
- /*
- * Establish next task to be run - it might have gone away because
- * we released the runqueue lock above:
- */
- if (!this_rq->nr_running)
- goto out_unlock;
- array = this_rq->active;
- if (!array->nr_active)
- array = this_rq->expired;
- BUG_ON(!array->nr_active);
+ if (i == this_cpu)
+ continue;
- p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
- task_t, run_list);
+ smt_rq = cpu_rq(i);
+ if (unlikely(!spin_trylock(&smt_rq->lock)))
+ continue;
- for_each_cpu_mask(i, sibling_map) {
- runqueue_t *smt_rq = cpu_rq(i);
- task_t *smt_curr = smt_rq->curr;
+ smt_curr = smt_rq->curr;
- /* Kernel threads do not participate in dependent sleeping */
- if (!p->mm || !smt_curr->mm || rt_task(p))
- goto check_smt_task;
+ if (!smt_curr->mm)
+ goto unlock;
/*
* If a user task with lower static priority than the
@@ -2808,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
if ((jiffies % DEF_TIMESLICE) >
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1;
- } else
+ } else {
if (smt_curr->static_prio < p->static_prio &&
!TASK_PREEMPTS_CURR(p, smt_rq) &&
smt_slice(smt_curr, sd) > task_timeslice(p))
ret = 1;
-
-check_smt_task:
- if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
- rt_task(smt_curr))
- continue;
- if (!p->mm) {
- wakeup_busy_runqueue(smt_rq);
- continue;
- }
-
- /*
- * Reschedule a lower priority task on the SMT sibling for
- * it to be put to sleep, or wake it up if it has been put to
- * sleep for priority reasons to see if it should run now.
- */
- if (rt_task(p)) {
- if ((jiffies % DEF_TIMESLICE) >
- (sd->per_cpu_gain * DEF_TIMESLICE / 100))
- resched_task(smt_curr);
- } else {
- if (TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(p, sd) > task_timeslice(smt_curr))
- resched_task(smt_curr);
- else
- wakeup_busy_runqueue(smt_rq);
}
+unlock:
+ spin_unlock(&smt_rq->lock);
}
-out_unlock:
- for_each_cpu_mask(i, sibling_map)
- spin_unlock(&cpu_rq(i)->lock);
return ret;
}
#else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
{
}
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
+ task_t *p)
{
return 0;
}
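The rewritten wake_sleeping_dependent() and dependent_sleeper() above replace
the old lock-all-siblings-in-CPU-order scheme with opportunistic
spin_trylock(): a busy sibling runqueue is simply skipped, so this_rq's lock
is never dropped and no global lock order has to be obeyed. A stand-alone
pthread model of the same trylock-and-skip pattern (user-space mutexes stand
in for the runqueue locks; build with -pthread):

/* Trylock-and-skip: never block on a sibling's lock, just bypass it. */
#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS	4

static pthread_mutex_t sibling_lock[NR_SIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

int main(void)
{
	int i, this_cpu = 0;

	/* Pretend sibling 2 is currently locked by someone else. */
	pthread_mutex_lock(&sibling_lock[2]);

	for (i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&sibling_lock[i]) != 0) {
			printf("sibling %d busy, skipped\n", i);
			continue;	/* bypass instead of waiting */
		}
		printf("sibling %d inspected\n", i);
		pthread_mutex_unlock(&sibling_lock[i]);
	}

	pthread_mutex_unlock(&sibling_lock[2]);
	return 0;
}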
@@ -2972,32 +3251,13 @@ need_resched_nonpreemptible:
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
-go_idle:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- /*
- * wake_sleeping_dependent() might have released
- * the runqueue, so break out if we got new
- * tasks meanwhile:
- */
- if (!rq->nr_running)
- goto switch_tasks;
- }
- } else {
- if (dependent_sleeper(cpu, rq)) {
- next = rq->idle;
+ wake_sleeping_dependent(cpu);
goto switch_tasks;
}
- /*
- * dependent_sleeper() releases and reacquires the runqueue
- * lock, hence go into the idle loop if the rq went
- * empty meanwhile:
- */
- if (unlikely(!rq->nr_running))
- goto go_idle;
}
array = rq->active;
@@ -3035,6 +3295,8 @@ go_idle:
}
}
next->sleep_type = SLEEP_NORMAL;
+ if (dependent_sleeper(cpu, rq, next))
+ next = rq->idle;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
@@ -3478,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
EXPORT_SYMBOL(sleep_on_timeout);
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+ unsigned long flags;
+ prio_array_t *array;
+ runqueue_t *rq;
+ int oldprio;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_rq_lock(p, &flags);
+
+ oldprio = p->prio;
+ array = p->array;
+ if (array)
+ dequeue_task(p, array);
+ p->prio = prio;
+
+ if (array) {
+ /*
+ * If changing to an RT priority then queue it
+ * in the active array!
+ */
+ if (rt_task(p))
+ array = rq->active;
+ enqueue_task(p, array);
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (task_running(rq, p)) {
+ if (p->prio > oldprio)
+ resched_task(rq->curr);
+ } else if (TASK_PREEMPTS_CURR(p, rq))
+ resched_task(rq->curr);
+ }
+ task_rq_unlock(rq, &flags);
+}
+
+#endif
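The prio value the rtmutex code hands to rt_mutex_setprio() is the minimum of
the owner's normal_prio and the prio of its top PI waiter (a smaller number
means a higher priority); that computation lives in rt_mutex_getprio() in
rtmutex.c, whose body is not shown here, so the sketch below is a hedged
model of it rather than a copy:

/* Model of the boost value: min(normal_prio, highest-priority PI waiter). */
#include <stdio.h>

#define min(a, b)	((a) < (b) ? (a) : (b))

static int getprio(int normal_prio, const int *pi_waiter_prio, int nr_waiters)
{
	int i, boost = normal_prio;

	for (i = 0; i < nr_waiters; i++)
		boost = min(boost, pi_waiter_prio[i]);
	return boost;
}

int main(void)
{
	int waiters[] = { 110, 89 };	/* an RT waiter (prio 89) blocks on us */
	int normal_prio = 120;		/* nice-0 lock owner */

	printf("boosted prio: %d\n", getprio(normal_prio, waiters, 2));
	printf("no waiters left: %d\n", getprio(normal_prio, NULL, 0));
	return 0;
}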
+
void set_user_nice(task_t *p, long nice)
{
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
- int old_prio, new_prio, delta;
+ int old_prio, delta;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -3498,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
* it wont have any effect on scheduling until the task is
* not SCHED_NORMAL/SCHED_BATCH:
*/
- if (rt_task(p)) {
+ if (has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
array = p->array;
- if (array)
+ if (array) {
dequeue_task(p, array);
+ dec_raw_weighted_load(rq, p);
+ }
- old_prio = p->prio;
- new_prio = NICE_TO_PRIO(nice);
- delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio += delta;
+ set_load_weight(p);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
if (array) {
enqueue_task(p, array);
+ inc_raw_weighted_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3524,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
out_unlock:
task_rq_unlock(rq, &flags);
}
-
EXPORT_SYMBOL(set_user_nice);
/*
@@ -3639,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
BUG_ON(p->array);
p->policy = policy;
p->rt_priority = prio;
- if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
- p->prio = MAX_RT_PRIO-1 - p->rt_priority;
- } else {
- p->prio = p->static_prio;
- /*
- * SCHED_BATCH tasks are treated as perpetual CPU hogs:
- */
- if (policy == SCHED_BATCH)
- p->sleep_avg = 0;
- }
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+ /*
+ * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+ */
+ if (policy == SCHED_BATCH)
+ p->sleep_avg = 0;
+ set_load_weight(p);
}
/**
@@ -3667,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
unsigned long flags;
runqueue_t *rq;
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
if (policy < 0)
@@ -3715,14 +4033,20 @@ recheck:
if (retval)
return retval;
/*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ spin_lock_irqsave(&p->pi_lock, flags);
+ /*
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
array = p->array;
@@ -3743,7 +4067,11 @@ recheck:
} else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+
return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3765,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
read_unlock_irq(&tasklist_lock);
return -ESRCH;
}
- retval = sched_setscheduler(p, policy, &lparam);
+ get_task_struct(p);
read_unlock_irq(&tasklist_lock);
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
return retval;
}
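do_sched_setscheduler() now drops tasklist_lock before calling
sched_setscheduler(), which takes p->pi_lock and calls into the rtmutex code,
and it pins the task with get_task_struct()/put_task_struct() so the
task_struct cannot be freed in that window. A stand-alone model of the
pin-before-unlock pattern using C11 atomics and a pthread rwlock (user-space
stand-ins, not the kernel API; build with -pthread):

/* Pin an object with a refcount before dropping the lock that found it. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refs;
	int value;
};

static void get_obj(struct obj *o) { atomic_fetch_add(&o->refs, 1); }

static void put_obj(struct obj *o)
{
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		free(o);			/* last reference dropped */
}

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->refs, 1);		/* creator's reference */
	o->value = 42;

	pthread_rwlock_rdlock(&table_lock);	/* lookup phase */
	get_obj(o);				/* pin before unlocking */
	pthread_rwlock_unlock(&table_lock);

	printf("safe to use after unlock: %d\n", o->value);
	put_obj(o);				/* drop our pin */

	put_obj(o);				/* creator's ref, frees it */
	return 0;
}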
@@ -4378,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu)
idle->timestamp = sched_clock();
idle->sleep_avg = 0;
idle->array = NULL;
- idle->prio = MAX_PRIO;
+ idle->prio = idle->normal_prio = MAX_PRIO;
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);
@@ -4474,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
*
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
*/
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
runqueue_t *rq_dest, *rq_src;
+ int ret = 0;
if (unlikely(cpu_is_offline(dest_cpu)))
- return;
+ return ret;
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
@@ -4508,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
}
-
+ ret = 1;
out:
double_rq_unlock(rq_src, rq_dest);
+ return ret;
}
/*
@@ -4580,9 +4914,12 @@ wait_to_die:
/* Figure out where a task on the dead CPU should go, use force if necessary. */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
{
+ runqueue_t *rq;
+ unsigned long flags;
int dest_cpu;
cpumask_t mask;
+restart:
/* On same node? */
mask = node_to_cpumask(cpu_to_node(dead_cpu));
cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4594,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
+ rq = task_rq_lock(tsk, &flags);
cpus_setall(tsk->cpus_allowed);
dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ task_rq_unlock(rq, &flags);
/*
* Don't tell them about moving exiting tasks or
@@ -4607,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
"longer affine to cpu%d\n",
tsk->pid, tsk->comm, dead_cpu);
}
- __migrate_task(tsk, dead_cpu, dest_cpu);
+ if (!__migrate_task(tsk, dead_cpu, dest_cpu))
+ goto restart;
}
/*
@@ -4734,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int migration_call(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int __cpuinit migration_call(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
{
int cpu = (long)hcpu;
struct task_struct *p;
@@ -4805,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
/* Register at highest priority so that task migration (migrate_all_tasks)
* happens before everything else.
*/
-static struct notifier_block migration_notifier = {
+static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
.priority = 10
};
@@ -5606,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node)
}
#endif
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
/*
* At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
* can switch it on easily if needed.
@@ -5621,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu)
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
#endif
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5637,7 +5979,7 @@ static int cpu_to_core_group(int cpu)
#endif
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
#if defined(CONFIG_SCHED_MC)
@@ -5694,13 +6036,74 @@ next_sg:
}
#endif
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+ int cpu;
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+#endif
+ for_each_cpu_mask(cpu, *cpu_map) {
+ if (sched_group_phys_bycpu[cpu]) {
+ kfree(sched_group_phys_bycpu[cpu]);
+ sched_group_phys_bycpu[cpu] = NULL;
+ }
+#ifdef CONFIG_SCHED_MC
+ if (sched_group_core_bycpu[cpu]) {
+ kfree(sched_group_core_bycpu[cpu]);
+ sched_group_core_bycpu[cpu] = NULL;
+ }
+#endif
+ }
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+ struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+ struct sched_group *sched_group_core = NULL;
+#endif
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
struct sched_group *sched_group_allnodes = NULL;
@@ -5708,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
/*
* Allocate the per-node list of sched groups
*/
- sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
- GFP_ATOMIC);
+ sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- return;
+ return -ENOMEM;
}
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif
@@ -5738,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
if (!sched_group_allnodes) {
printk(KERN_WARNING
"Can not alloc allnodes sched group\n");
- break;
+ goto error;
}
sched_group_allnodes_bycpu[i]
= sched_group_allnodes;
@@ -5759,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
cpus_and(sd->span, sd->span, *cpu_map);
#endif
+ if (!sched_group_phys) {
+ sched_group_phys
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_phys) {
+ printk(KERN_WARNING "Can not alloc phys sched "
+ "group\n");
+ goto error;
+ }
+ sched_group_phys_bycpu[i] = sched_group_phys;
+ }
+
p = sd;
sd = &per_cpu(phys_domains, i);
group = cpu_to_phys_group(i);
@@ -5768,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
sd->groups = &sched_group_phys[group];
#ifdef CONFIG_SCHED_MC
+ if (!sched_group_core) {
+ sched_group_core
+ = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+ GFP_KERNEL);
+ if (!sched_group_core) {
+ printk(KERN_WARNING "Can not alloc core sched "
+ "group\n");
+ goto error;
+ }
+ sched_group_core_bycpu[i] = sched_group_core;
+ }
+
p = sd;
sd = &per_cpu(core_domains, i);
group = cpu_to_core_group(i);
@@ -5851,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
domainspan = sched_domain_node_span(i);
cpus_and(domainspan, domainspan, *cpu_map);
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+ if (!sg) {
+ printk(KERN_WARNING "Can not alloc domain group for "
+ "node %d\n", i);
+ goto error;
+ }
sched_group_nodes[i] = sg;
for_each_cpu_mask(j, nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
sd->groups = sg;
- if (sd->groups == NULL) {
- /* Turn off balancing if we have no groups */
- sd->flags = 0;
- }
- }
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", i);
- continue;
}
sg->cpu_power = 0;
sg->cpumask = nodemask;
+ sg->next = sg;
cpus_or(covered, covered, nodemask);
prev = sg;
@@ -5887,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
if (cpus_empty(tmp))
continue;
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sg = kmalloc_node(sizeof(struct sched_group),
+ GFP_KERNEL, i);
if (!sg) {
printk(KERN_WARNING
"Can not alloc domain group for node %d\n", j);
- break;
+ goto error;
}
sg->cpu_power = 0;
sg->cpumask = tmp;
+ sg->next = prev->next;
cpus_or(covered, covered, tmp);
prev->next = sg;
prev = sg;
}
- prev->next = sched_group_nodes[i];
}
#endif
/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- int power;
struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ }
#endif
#ifdef CONFIG_SCHED_MC
+ for_each_cpu_mask(i, *cpu_map) {
+ int power;
+ struct sched_domain *sd;
sd = &per_cpu(core_domains, i);
- power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
* SCHED_LOAD_SCALE / 10;
sd->groups->cpu_power = power;
+ }
+#endif
+ for_each_cpu_mask(i, *cpu_map) {
+ struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
sd = &per_cpu(phys_domains, i);
+ if (i != first_cpu(sd->groups->cpumask))
+ continue;
- /*
- * This has to be < 2 * SCHED_LOAD_SCALE
- * Lets keep it SCHED_LOAD_SCALE, so that
- * while calculating NUMA group's cpu_power
- * we can simply do
- * numa_group->cpu_power += phys_group->cpu_power;
- *
- * See "only add power once for each physical pkg"
- * comment below
- */
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sd->groups->cpu_power = 0;
+ if (sched_mc_power_savings || sched_smt_power_savings) {
+ int j;
+
+ for_each_cpu_mask(j, sd->groups->cpumask) {
+ struct sched_domain *sd1;
+ sd1 = &per_cpu(core_domains, j);
+ /*
+ * for each core we add to the group in
+ * the physical domain exactly once
+ */
+ if (j != first_cpu(sd1->groups->cpumask))
+ continue;
+
+ if (sched_smt_power_savings)
+ sd->groups->cpu_power += sd1->groups->cpu_power;
+ else
+ sd->groups->cpu_power += SCHED_LOAD_SCALE;
+ }
+ } else
+ /*
+ * This has to be < 2 * SCHED_LOAD_SCALE
+ * Lets keep it SCHED_LOAD_SCALE, so that
+ * while calculating NUMA group's cpu_power
+ * we can simply do
+ * numa_group->cpu_power += phys_group->cpu_power;
+ *
+ * See "only add power once for each physical pkg"
+ * comment below
+ */
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
#else
+ int power;
sd = &per_cpu(phys_domains, i);
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
+ if (sched_smt_power_savings)
+ power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+ else
+ power = SCHED_LOAD_SCALE;
sd->groups->cpu_power = power;
#endif
}
@@ -5962,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
* Tune cache-hot values:
*/
calibrate_migration_costs(cpu_map);
+
+ return 0;
+
+error:
+ free_sched_groups(cpu_map);
+ return -ENOMEM;
}
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
+ int err;
/*
* Setup mask for cpus without special case scheduling requirements.
@@ -5977,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
*/
cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
- build_sched_domains(&cpu_default_map);
+ err = build_sched_domains(&cpu_default_map);
+
+ return err;
}
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
-#ifdef CONFIG_NUMA
- int i;
- int cpu;
-
- for_each_cpu_mask(cpu, *cpu_map) {
- struct sched_group *sched_group_allnodes
- = sched_group_allnodes_bycpu[cpu];
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
-
- if (sched_group_allnodes) {
- kfree(sched_group_allnodes);
- sched_group_allnodes_bycpu[cpu] = NULL;
- }
-
- if (!sched_group_nodes)
- continue;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- }
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
- }
-#endif
+ free_sched_groups(cpu_map);
}
/*
@@ -6046,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
* correct sched domains
* Call with hotplug lock held
*/
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
{
cpumask_t change_map;
+ int err = 0;
cpus_and(*partition1, *partition1, cpu_online_map);
cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6057,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
/* Detach sched domains from all of the affected cpus */
detach_destroy_domains(&change_map);
if (!cpus_empty(*partition1))
- build_sched_domains(partition1);
- if (!cpus_empty(*partition2))
- build_sched_domains(partition2);
+ err = build_sched_domains(partition1);
+ if (!err && !cpus_empty(*partition2))
+ err = build_sched_domains(partition2);
+
+ return err;
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+ int err;
+
+ lock_cpu_hotplug();
+ detach_destroy_domains(&cpu_online_map);
+ err = arch_init_sched_domains(&cpu_online_map);
+ unlock_cpu_hotplug();
+
+ return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+ int ret;
+
+ if (buf[0] != '0' && buf[0] != '1')
+ return -EINVAL;
+
+ if (smt)
+ sched_smt_power_savings = (buf[0] == '1');
+ else
+ sched_mc_power_savings = (buf[0] == '1');
+
+ ret = arch_reinit_sched_domains();
+
+ return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+ int err = 0;
+#ifdef CONFIG_SCHED_SMT
+ if (smt_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+ if (!err && mc_capable())
+ err = sysfs_create_file(&cls->kset.kobj,
+ &attr_sched_mc_power_savings.attr);
+#endif
+ return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+ return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+ return sched_power_savings_store(buf, count, 1);
}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+ sched_smt_power_savings_store);
+#endif
+
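The SYSDEV_ATTR() definitions above, registered through sched_create_sysfs_power_savings_entries(), attach the two knobs to whatever sysdev class the architecture passes in, presumably the cpu class, which would expose them as files such as /sys/devices/system/cpu/sched_mc_power_savings accepting "0" or "1". A hedged user-space sketch of flipping one knob; the sysfs path is an inference, not something stated in the patch:

#include <stdio.h>

/* Toggle the multi-core power-savings knob; returns 0 on success.
 * The sysfs path is an assumption inferred from the cpu sysdev class. */
static int set_sched_mc_power_savings(int enable)
{
	const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(enable ? "1" : "0", f);
	return fclose(f);
}

int main(void)
{
	if (set_sched_mc_power_savings(1))
		perror("sched_mc_power_savings");
	return 0;
}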
#ifdef CONFIG_HOTPLUG_CPU
/*
@@ -6143,7 +6650,6 @@ void __init sched_init(void)
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
- rq->cpu = i;
#endif
atomic_set(&rq->nr_iowait, 0);
@@ -6158,6 +6664,7 @@ void __init sched_init(void)
}
}
+ set_load_weight(&init_task);
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -6204,11 +6711,12 @@ void normalize_rt_tasks(void)
runqueue_t *rq;
read_lock_irq(&tasklist_lock);
- for_each_process (p) {
+ for_each_process(p) {
if (!rt_task(p))
continue;
- rq = task_rq_lock(p, &flags);
+ spin_lock_irqsave(&p->pi_lock, flags);
+ rq = __task_rq_lock(p);
array = p->array;
if (array)
@@ -6219,7 +6727,8 @@ void normalize_rt_tasks(void)
resched_task(rq->curr);
}
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
}
read_unlock_irq(&tasklist_lock);
}
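The normalize_rt_tasks() hunks above replace task_rq_lock() with an explicit pair of locks: the task's pi_lock is taken first, then the runqueue lock via __task_rq_lock(), matching the pi_lock before rq->lock ordering used by the new rtmutex priority-inheritance code. A minimal sketch of that ordering; the surrounding function is an illustrative assumption, the lock calls mirror the hunk:

/* Sketch only: pi_lock first, runqueue lock second, release in reverse. */
static void with_task_locks(struct task_struct *p)
{
	unsigned long flags;
	runqueue_t *rq;

	spin_lock_irqsave(&p->pi_lock, flags);	/* PI state first */
	rq = __task_rq_lock(p);			/* then the runqueue */

	/* ... inspect or requeue p here ... */

	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);
}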
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9e2f1c6e73d..8f03e3b89b5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
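Marking the CPU-hotplug callback __devinit and its notifier_block __devinitdata lets the kernel discard them after boot when hotplug support is not configured; the same annotation is applied to the notifiers in softlockup.c, timer.c and workqueue.c below. A skeletal kernel-style example of the pattern, with placeholder names; the resource handling in the comments is illustrative:

/* Skeleton of the annotated notifier pattern; names are placeholders. */
static int __devinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	/* (unsigned long)hcpu identifies the CPU being brought up or down */
	switch (action) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu resources here */
		break;
	case CPU_DEAD:
		/* tear them down again */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __devinitdata example_cpu_nfb = {
	.notifier_call = example_cpu_callback,
};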
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b5c3b94e01c..6b76caa2298 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
/*
* Create/destroy watchdog threads as CPUs come and go:
*/
-static int
+static int __devinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f1a4eb1a655..93a2c539864 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,6 +133,10 @@ extern int acct_parm[];
extern int no_unaligned_warning;
#endif
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
ctl_table *, void **);
static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -688,6 +692,17 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_RT_MUTEXES
+ {
+ .ctl_name = KERN_MAX_LOCK_DEPTH,
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+
{ .ctl_name = 0 }
};
@@ -928,6 +943,18 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_jiffies,
},
#endif
+#ifdef CONFIG_X86_32
+ {
+ .ctl_name = VM_VDSO_ENABLED,
+ .procname = "vdso_enabled",
+ .data = &vdso_enabled,
+ .maxlen = sizeof(vdso_enabled),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+#endif
{ .ctl_name = 0 }
};
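With the KERN_MAX_LOCK_DEPTH entry added to kern_table, the rtmutex chain-walk limit becomes tunable at runtime through /proc/sys/kernel/max_lock_depth (proc_dointvec, mode 0644). A small user-space sketch that reads the current value; the helper name is illustrative, while the /proc path follows from the "max_lock_depth" procname above:

#include <stdio.h>

/* Read the rtmutex max_lock_depth sysctl; returns -1 on error. */
static int read_max_lock_depth(void)
{
	FILE *f = fopen("/proc/sys/kernel/max_lock_depth", "r");
	int depth = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%d", &depth) != 1)
		depth = -1;
	fclose(f);
	return depth;
}

int main(void)
{
	printf("max_lock_depth = %d\n", read_max_lock_depth());
	return 0;
}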
diff --git a/kernel/timer.c b/kernel/timer.c
index 5bb6b7976ee..5a896025306 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int timer_cpu_notify(struct notifier_block *self,
+static int __devinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1672,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
+static struct notifier_block __devinitdata timers_nb = {
.notifier_call = timer_cpu_notify,
};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 565cf7a1feb..59f0b42bd89 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
}
/* We're holding the cpucontrol mutex here */
-static int workqueue_cpu_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{