From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:48 -0700 Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization To keep the dcache from filling up with dead /proc entries we flush them on process exit. However over the years that code has gotten hairy with a dentry_pointer and a lock in task_struct and misdocumented as a correctness feature. I have rewritten this code to look and see if we have a corresponding entry in the dcache and if so flush it on process exit. This removes the extra fields in the task_struct and allows me to trivially handle the case of a /proc//task/ entry as well as the current /proc/ entries. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index dfd10cb370c..79e91046f36 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -993,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags, if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup; - p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); -- cgit v1.2.3-70-g09d2 From cf2dfbfbf4c5cb489ea12defd85a484307b955b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 Jun 2006 00:26:10 -0700 Subject: [PATCH] coredump: copy_process: don't check SIGNAL_GROUP_EXIT After the previous patch SIGNAL_GROUP_EXIT implies a pending SIGKILL, we can remove this check from copy_process() because we already checked !signal_pending(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 79e91046f36..9b4e54ef022 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1156,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); -- cgit v1.2.3-70-g09d2 From 23f78d4a03c53cbd75d87a795378ea540aa08c86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:53 -0700 Subject: [PATCH] pi-futex: rt mutex core Core functions for the rt-mutex subsystem. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 1 + include/linux/rtmutex.h | 104 ++++++ include/linux/sched.h | 12 + include/linux/sysctl.h | 1 + init/Kconfig | 5 + kernel/Makefile | 1 + kernel/fork.c | 16 + kernel/rtmutex.c | 904 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rtmutex.h | 29 ++ kernel/rtmutex_common.h | 93 +++++ kernel/sysctl.c | 15 + 11 files changed, 1181 insertions(+) create mode 100644 include/linux/rtmutex.h create mode 100644 kernel/rtmutex.c create mode 100644 kernel/rtmutex.h create mode 100644 kernel/rtmutex_common.h (limited to 'kernel/fork.c') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 678c1a90380..3a256957fb5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,7 @@ extern struct group_info init_groups; .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ .pi_lock = SPIN_LOCK_UNLOCKED, \ + INIT_RT_MUTEXES(tsk) \ } diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h new file mode 100644 index 00000000000..12309c916c6 --- /dev/null +++ b/include/linux/rtmutex.h @@ -0,0 +1,104 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the public data structure and API definitions. + */ + +#ifndef __LINUX_RT_MUTEX_H +#define __LINUX_RT_MUTEX_H + +#include +#include +#include + +/* + * The rt_mutex structure + * + * @wait_lock: spinlock to protect the structure + * @wait_list: pilist head to enqueue waiters in priority order + * @owner: the mutex owner + */ +struct rt_mutex { + spinlock_t wait_lock; + struct plist_head wait_list; + struct task_struct *owner; +#ifdef CONFIG_DEBUG_RT_MUTEXES + int save_state; + struct list_head held_list_entry; + unsigned long acquire_ip; + const char *name, *file; + int line; + void *magic; +#endif +}; + +struct rt_mutex_waiter; +struct hrtimer_sleeper; + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + , .name = #mutexname, .file = __FILE__, .line = __LINE__ +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __FUNCTION__) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); +#else +# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) +# define rt_mutex_debug_task_free(t) do { } while (0) +#endif + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { .wait_lock = SPIN_LOCK_UNLOCKED \ + , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ + , .owner = NULL \ + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + +#define DEFINE_RT_MUTEX(mutexname) \ + struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) + +/*** + * rt_mutex_is_locked - is the mutex locked + * @lock: the mutex to be queried + * + * Returns 1 if the mutex is locked, 0 if unlocked. + */ +static inline int rt_mutex_is_locked(struct rt_mutex *lock) +{ + return lock->owner != NULL; +} + +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void rt_mutex_destroy(struct rt_mutex *lock); + +extern void rt_mutex_lock(struct rt_mutex *lock); +extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock); +extern int rt_mutex_timed_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout, + int detect_deadlock); + +extern int rt_mutex_trylock(struct rt_mutex *lock); + +extern void rt_mutex_unlock(struct rt_mutex *lock); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# define INIT_RT_MUTEX_DEBUG(tsk) \ + .held_list_head = LIST_HEAD_INIT(tsk.held_list_head), \ + .held_list_lock = SPIN_LOCK_UNLOCKED +#else +# define INIT_RT_MUTEX_DEBUG(tsk) +#endif + +#ifdef CONFIG_RT_MUTEXES +# define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ + INIT_RT_MUTEX_DEBUG(tsk) +#else +# define INIT_RT_MUTEXES(tsk) +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 6f167645e7e..6ea23c9af41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -73,6 +73,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -858,6 +859,17 @@ struct task_struct { /* Protection of the PI data structures: */ spinlock_t pi_lock; +#ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task */ + struct plist_head pi_waiters; + /* Deadlock detection and priority inheritance handling */ + struct rt_mutex_waiter *pi_blocked_on; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spinlock_t held_list_lock; + struct list_head held_list_head; +# endif +#endif + #ifdef CONFIG_DEBUG_MUTEXES /* mutex deadlock detection */ struct mutex_waiter *blocked_on; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bee12a7a057..46e4d8f2771 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -149,6 +149,7 @@ enum KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ + KERN_MAX_LOCK_DEPTH=74, }; diff --git a/init/Kconfig b/init/Kconfig index df55b366560..f70f2fd273c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -339,9 +339,14 @@ config BASE_FULL kernel data structures. This saves memory on small machines, but may reduce performance. +config RT_MUTEXES + boolean + select PLIST + config FUTEX bool "Enable futex support" if EMBEDDED default y + select RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not diff --git a/kernel/Makefile b/kernel/Makefile index 752bd7d383a..21df9a338ff 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/fork.c b/kernel/fork.c index 9b4e54ef022..b664a081fff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; void free_task(struct task_struct *tsk) { free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, mpol_fix_fork_child_flag(p); #endif + rt_mutex_init_task(p); + #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000000..937a474fae9 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,904 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + */ +#include +#include +#include +#include + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 Transitional State* + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + * + * (*) There's a small time where the owner can be NULL and the + * "lock has waiters" bit is set. This can happen when grabbing the lock. + * To prevent a cmpxchg of the owner releasing the lock, we need to set this + * bit before looking at the lock, hence the reason this is a transitional + * state. + */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. + */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + current->comm, current->pid); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; + + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == current) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + return ret; +} + +/* + * Optimization: check if we can steal the lock from the + * assigned pending owner [which might not have taken the + * lock yet]: + */ +static inline int try_to_steal_lock(struct rt_mutex *lock) +{ + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; + unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + + if (pendowner == current) + return 1; + + spin_lock_irqsave(&pendowner->pi_lock, flags); + if (current->prio >= pendowner->prio) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 0; + } + + /* + * Check if a waiter is enqueued on the pending owners + * pi_waiters list. Remove it and readjust pending owners + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 1; + } + + /* No chain handling, pending owner is not blocked on anything: */ + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into + * current->pi_waiters list. This covers the case, + * where current is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher + * priority as current->normal_prio. + * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task + * might be current: + */ + if (likely(next->task != current)) { + spin_lock_irqsave(¤t->pi_lock, flags); + plist_add(&next->pi_list_entry, ¤t->pi_waiters); + __rt_mutex_adjust_prio(current); + spin_unlock_irqrestore(¤t->pi_lock, flags); + } + return 1; +} + +/* + * Try to take an rt-mutex + * + * This fails + * - when the lock has a real owner + * - when a different pending owner exists and has higher priority than current + * + * Must be called with lock->wait_lock held. + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + return 0; + + /* We got the lock. */ + debug_rt_mutex_lock(lock __IP__); + + rt_mutex_set_owner(lock, current, 0); + + rt_mutex_deadlock_account_lock(lock, current); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock + __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + __rt_mutex_adjust_prio(current); + waiter->task = current; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + if (!boost) + return 0; + + spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + waiter __IP__); + + spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and from + * the lock waiter list. Set it as pending owner. Then wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + pendowner = waiter->task; + waiter->task = NULL; + + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + /* + * Clear the pi_blocked_on variable and enqueue a possible + * waiter into the pi_waiters list of the pending owner. This + * prevents that in case the pending owner gets unboosted a + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. + */ + spin_lock_irqsave(&pendowner->pi_lock, flags); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); + WARN_ON(pendowner->pi_blocked_on->lock != lock); + + pendowner->pi_blocked_on = NULL; + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + } + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + wake_up_process(pendowner); +} + +/* + * Remove a waiter from a lock + * + * Must be called with lock->wait_lock held + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (first && owner != current) { + + spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) { + boost = 1; + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!boost) + return; + + spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL __IP__); + + spin_lock(&lock->wait_lock); +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock __IP__)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_ABS); + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock __IP__)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock __IP__); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) + break; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + schedule(); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter __IP__); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +{ + int ret = 0; + + spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock __IP__); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock __RET_IP__); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/*** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list, &lock->wait_lock); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 00000000000..1e0fca13ff7 --- /dev/null +++ b/kernel/rtmutex.h @@ -0,0 +1,29 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. + */ + +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 00000000000..50eed60eb08 --- /dev/null +++ b/kernel/rtmutex_common.h @@ -0,0 +1,93 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; +} + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f54afed8426..93a2c539864 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -133,6 +133,10 @@ extern int acct_parm[]; extern int no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -688,6 +692,17 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From c87e2837be82df479a6bae9f155c43516d2feebc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jun 2006 02:54:58 -0700 Subject: [PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support This adds the actual pi-futex implementation, based on rt-mutexes. [dino@in.ibm.com: fix an oops-causing race] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Signed-off-by: Dinakar Guniguntala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/futex.h | 7 + include/linux/sched.h | 3 + kernel/exit.c | 8 + kernel/fork.c | 3 + kernel/futex.c | 829 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/futex_compat.c | 11 +- kernel/rtmutex_common.h | 8 + 7 files changed, 828 insertions(+), 41 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/futex.h b/include/linux/futex.h index f05a3f46932..34c3a215f2c 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -12,6 +12,9 @@ #define FUTEX_REQUEUE 3 #define FUTEX_CMP_REQUEUE 4 #define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 /* * Support for robust futexes: the kernel cleans up held futexes at @@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr); #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); +extern void exit_pi_state_list(struct task_struct *curr); #else static inline void exit_robust_list(struct task_struct *curr) { } +static inline void exit_pi_state_list(struct task_struct *curr) +{ +} #endif #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ diff --git a/include/linux/sched.h b/include/linux/sched.h index edadd13cf53..b4e6be7de5a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -84,6 +84,7 @@ struct sched_param { #include struct exec_domain; +struct futex_pi_state; /* * List of flags we want to share for kernel threads, @@ -915,6 +916,8 @@ struct task_struct { #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; diff --git a/kernel/exit.c b/kernel/exit.c index 3e8a0282e9a..ab06b9f88f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -925,6 +925,14 @@ fastcall NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); /* * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ diff --git a/kernel/fork.c b/kernel/fork.c index b664a081fff..628198a4f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index 50356fb5d72..b305b7f8dad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,6 +12,10 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -46,6 +50,8 @@ #include #include +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -74,6 +80,27 @@ union futex_key { } both; }; +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + /* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -96,6 +123,10 @@ struct futex_q { /* For fd, sigio sent using these: */ int fd; struct file *filp; + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; + struct task_struct *task; }; /* @@ -258,6 +289,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) return ret ? -EFAULT : 0; } +/* + * Fault handling. Called with current->mm->mmap_sem held. + */ +static int futex_handle_fault(unsigned long address, int attempt) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + if (attempt >= 2 || !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) + return -EFAULT; + + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + return -EFAULT; + } + return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + memset(pi_state, 0, sizeof(*pi_state)); + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ + if (!atomic_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid)) { + p = NULL; + goto out_unlock; + } + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + p = NULL; + goto out_unlock; + } + get_task_struct(p); +out_unlock: + read_unlock(&tasklist_lock); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct futex_hash_bucket *hb; + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + union futex_key key; + + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselfs + */ + spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + spin_unlock_irq(&curr->pi_lock); + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + list_del_init(&pi_state->list); + + WARN_ON(pi_state->owner != curr); + + pi_state->owner = NULL; + spin_unlock_irq(&curr->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + spin_unlock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + } + spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ + struct futex_pi_state *pi_state = NULL; + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; + pid_t pid; + + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &me->key)) { + /* + * Another waiter already exists - bump up + * the refcount and return its pi_state: + */ + pi_state = this->pi_state; + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; + + return 0; + } + } + + /* + * We are the first waiter - try to look up the real owner and + * attach the new pi_state to it: + */ + pid = uval & FUTEX_TID_MASK; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make 'p' + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = me->key; + + spin_lock_irq(&p->pi_lock); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + me->pi_state = pi_state; + + return 0; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q) q->lock_ptr = NULL; } +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ + struct task_struct *new_owner; + struct futex_pi_state *pi_state = this->pi_state; + u32 curval, newval; + + if (!pi_state) + return -EINVAL; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + + /* + * This happens when we have stolen the lock and the original + * pending owner did not enqueue itself back on the rt_mutex. + * Thats not a tragedy. We know that way, that a lock waiter + * is on the fly. We make the futex_q waiter the pending owner. + */ + if (!new_owner) + new_owner = this->task; + + /* + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) + */ + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + + list_del_init(&pi_state->owner->pi_state_list); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + rt_mutex_unlock(&pi_state->pi_mutex); + + return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ + u32 oldval; + + /* + * There is no waiter, so we unlock the futex. The owner died + * bit has not to be preserved here. We are the owner: + */ + inc_preempt_count(); + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); + dec_preempt_count(); + + if (oldval == -EFAULT) + return oldval; + if (oldval != uval) + return -EAGAIN; + + return 0; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake) list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { + if (this->pi_state) + return -EINVAL; wake_futex(this); if (++ret >= nr_wake) break; @@ -385,27 +708,9 @@ retry: * still holding the mmap_sem. */ if (attempt++) { - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - unsigned long address = (unsigned long)uaddr2; - - ret = -EFAULT; - if (attempt >= 2 || - !(vma = find_vma(mm, address)) || - vma->vm_start > address || - !(vma->vm_flags & VM_WRITE)) + if (futex_handle_fault((unsigned long)uaddr2, + attempt)) goto out; - - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: - goto out; - } goto retry; } @@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { list_add_tail(&q->list, &hb->chain); + q->task = current; spin_unlock(&hb->lock); } @@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q) } WARN_ON(list_empty(&q->list)); list_del(&q->list); + + BUG_ON(q->pi_state); + spin_unlock(lock_ptr); ret = 1; } @@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q) return ret; } +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +{ + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) { - DECLARE_WAITQUEUE(wait, current); + struct task_struct *curr = current; + DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; + q.pi_state = NULL; retry: - down_read(¤t->mm->mmap_sem); + down_read(&curr->mm->mmap_sem); ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) @@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * If we would have faulted, release mmap_sem, fault it in and * start all over again. */ - up_read(¤t->mm->mmap_sem); + up_read(&curr->mm->mmap_sem); ret = get_user(uval, uaddr); @@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) goto retry; return ret; } - if (uval != val) { - ret = -EWOULDBLOCK; - queue_unlock(&q, hb); - goto out_release_sem; - } + ret = -EWOULDBLOCK; + if (uval != val) + goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ __queue_me(&q, hb); @@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. - */ - up_read(¤t->mm->mmap_sem); + */ + up_read(&curr->mm->mmap_sem); /* * There might have been scheduling since the queue_me(), as we @@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) */ return -EINTR; + out_unlock_release_sem: + queue_unlock(&q, hb); + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) + */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, + struct hrtimer_sleeper *to) +{ + struct task_struct *curr = current; + struct futex_hash_bucket *hb; + u32 uval, newval, curval; + struct futex_q q; + int ret, attempt = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + + q.pi_state = NULL; + retry: + down_read(&curr->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + hb = queue_lock(&q, -1, NULL); + + retry_locked: + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = current->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + + /* We own the lock already */ + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (!detect && 0) + force_sig(SIGKILL, current); + ret = -EDEADLK; + goto out_unlock_release_sem; + } + + /* + * Surprise - we got the lock. Just return + * to userspace: + */ + if (unlikely(!curval)) + goto out_unlock_release_sem; + + uval = curval; + newval = uval | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. We also set the waiters bit + * unconditionally here, to simplify glibc handling of + * multiple tasks racing to acquire the lock and + * cleanup the problems which were left by the dead + * owner. + */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | + FUTEX_OWNER_DIED | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } + goto out_unlock_release_sem; + } + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(&curr->mm->mmap_sem); + + WARN_ON(!q.pi_state); + /* + * Block on the PI mutex: + */ + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + u32 newtid = current->pid | FUTEX_WAITERS; + + /* Owner died? */ + if (q.pi_state->owner != NULL) { + spin_lock_irq(&q.pi_state->owner->pi_lock); + list_del_init(&q.pi_state->list); + spin_unlock_irq(&q.pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + q.pi_state->owner = current; + + spin_lock_irq(¤t->pi_lock); + list_add(&q.pi_state->list, ¤t->pi_state_list); + spin_unlock_irq(¤t->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + } + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); + + return ret; + + out_unlock_release_sem: + queue_unlock(&q, hb); + + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; + + uaddr_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock_release_sem; + + goto retry_locked; + } + + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + + return ret; +} + +/* + * Restart handler + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper timeout, *to = NULL; + int ret; + + restart->fn = do_no_restart_syscall; + + if (restart->arg2 || restart->arg3) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | + (u64) restart->arg0; + } + + pr_debug("lock_pi restart: %p, %d (%d)\n", + (u32 __user *)restart->arg0, current->pid); + + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, + 0, to); + + if (ret != -EINTR) + return ret; + + restart->fn = futex_lock_pi_restart; + + /* The other values are filled in */ + return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below. + */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + int ret; + + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + + ret = do_futex_lock_pi(uaddr, detect, trylock, to); + + if (ret != -EINTR) + return ret; + + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); + + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->arg0 = (unsigned long) uaddr; + restart->arg1 = detect; + if (to) { + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = to->timer.expires.tv64 >> 32; + } else + restart->arg2 = restart->arg3 = 0; + + return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + u32 uval; + struct list_head *head; + union futex_key key; + int ret, attempt = 0; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != current->pid) + return -EPERM; + /* + * First take all the futex related locks: + */ + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + +retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again. If it succeeds then we can return without waking + * anyone else up: + */ + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + + if (unlikely(uval == -EFAULT)) + goto pi_faulted; + /* + * Rare case: we managed to release the lock atomically, + * no need to wake anyone else up: + */ + if (unlikely(uval == current->pid)) + goto out_unlock; + + /* + * Ok, other tasks may need to be woken up - check waiters + * and do the wakeup if necessary: + */ + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (!match_futex (&this->key, &key)) + continue; + ret = wake_futex_pi(uaddr, uval, this); + /* + * The atomic access to the futex value + * generated a pagefault, so retry the + * user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + goto out_unlock; + } + /* + * No waiters - kernel unlocks the futex: + */ + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + +out_unlock: + spin_unlock(&hb->lock); +out: + up_read(¤t->mm->mmap_sem); + + return ret; + +pi_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) + goto out_unlock; + + goto retry_locked; + } + + spin_unlock(&hb->lock); up_read(¤t->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + return ret; } @@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal) err = -ENOMEM; goto error; } + q->pi_state = NULL; down_read(¤t->mm->mmap_sem); err = get_futex_key(uaddr, &q->key); @@ -856,7 +1591,7 @@ error: * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after @@ -931,7 +1666,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - u32 uval; + u32 uval, nval; retry: if (get_user(uval, uaddr)) @@ -948,8 +1683,12 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - if (futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED) != uval) + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED); + if (nval == -EFAULT) + return -1; + + if (nval != uval) goto retry; if (uval & FUTEX_WAITERS) @@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr) while (entry != &head->list) { /* * A pending lock might already be on the list, so - * dont process it twice: + * don't process it twice: */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, @@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; + case FUTEX_LOCK_PI: + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + break; + case FUTEX_UNLOCK_PI: + ret = futex_unlock_pi(uaddr); + break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } @@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; u32 val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. */ - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7e57c31670a..d1d92b441fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (get_compat_timespec(&t, utime)) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e068024eeff..9c75856e791 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; } +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); #endif -- cgit v1.2.3-70-g09d2