From a29c33f4e506e1dae7e0985b6328046535becbf8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 18:51:01 -0800 Subject: userns: Convert setting and getting uid and gid system calls to use kuid and kgid Convert setregid, setgid, setreuid, setuid, setresuid, getresuid, setresgid, getresgid, setfsuid, setfsgid, getuid, geteuid, getgid, getegid, waitpid, waitid, wait4. Convert userspace uids and gids into kuids and kgids before being placed on struct cred. Convert struct cred kuids and kgids into userspace uids and gids when returning them. Signed-off-by: Eric W. Biederman --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index d8bd3b425fa..789e3c5777f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = __task_cred(p)->uid; + uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) @@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); -- cgit v1.2.3-70-g09d2 From 8ca937a668d4be0b0c89c4fc9a912a5885ac06fe Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 17 May 2012 23:31:39 +0200 Subject: cred: use correct cred accessor with regards to rcu read lock Commit "userns: Convert setting and getting uid and gid system calls to use kuid and kgid has modified the accessors in wait_task_continued() and wait_task_stopped() to use __task_cred() instead of task_uid(). __task_cred() assumes that we're inside a rcu read lock, which is untrue for these two functions. Modify it to use task_uid() instead. Signed-off-by: Sasha Levin Signed-off-by: Eric W. Biederman --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 789e3c5777f..910a0716e17 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); -- cgit v1.2.3-70-g09d2 From e73f8959af0439d114847eab5a8a5ce48f1217c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:07 +1000 Subject: task_work_add: generic process-context callbacks Provide a simple mechanism that allows running code in the (nonatomic) context of the arbitrary task. The caller does task_work_add(task, task_work) and this task executes task_work->func() either from do_notify_resume() or from do_exit(). The callback can rely on PF_EXITING to detect the latter case. "struct task_work" can be embedded in another struct, still it has "void *data" to handle the most common/simple case. This allows us to kill the ->replacement_session_keyring hack, and potentially this can have more users. Performance-wise, this adds 2 "unlikely(!hlist_empty())" checks into tracehook_notify_resume() and do_exit(). But at the same time we can remove the "replacement_session_keyring != NULL" checks from arch/*/signal.c and exit_creds(). Note: task_work_add/task_work_run abuses ->pi_lock. This is only because this lock is already used by lookup_pi_state() to synchronize with do_exit() setting PF_EXITING. Fortunately the scope of this lock in task_work.c is really tiny, and the code is unlikely anyway. Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Thomas Gleixner Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/sched.h | 2 ++ include/linux/task_work.h | 33 +++++++++++++++++++ include/linux/tracehook.h | 11 +++++++ kernel/Makefile | 2 +- kernel/exit.c | 5 ++- kernel/fork.c | 1 + kernel/task_work.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 include/linux/task_work.h create mode 100644 kernel/task_work.c (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ea8baea938..7930131abc1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,6 +1400,8 @@ struct task_struct { int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; + struct hlist_head task_works; + struct audit_context *audit_context; #ifdef CONFIG_AUDITSYSCALL uid_t loginuid; diff --git a/include/linux/task_work.h b/include/linux/task_work.h new file mode 100644 index 00000000000..294d5d5e90b --- /dev/null +++ b/include/linux/task_work.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_TASK_WORK_H +#define _LINUX_TASK_WORK_H + +#include +#include + +struct task_work; +typedef void (*task_work_func_t)(struct task_work *); + +struct task_work { + struct hlist_node hlist; + task_work_func_t func; + void *data; +}; + +static inline void +init_task_work(struct task_work *twork, task_work_func_t func, void *data) +{ + twork->func = func; + twork->data = data; +} + +int task_work_add(struct task_struct *task, struct task_work *twork, bool); +struct task_work *task_work_cancel(struct task_struct *, task_work_func_t); +void task_work_run(void); + +static inline void exit_task_work(struct task_struct *task) +{ + if (unlikely(!hlist_empty(&task->task_works))) + task_work_run(); +} + +#endif /* _LINUX_TASK_WORK_H */ diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b9ca903bb55..b2dd0917ca0 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -49,6 +49,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -164,8 +165,10 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, */ static inline void set_notify_resume(struct task_struct *task) { +#ifdef TIF_NOTIFY_RESUME if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); +#endif } /** @@ -185,6 +188,14 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) { if (current->replacement_session_keyring) key_replace_session_keyring(); + /* + * The caller just cleared TIF_NOTIFY_RESUME. This barrier + * pairs with task_work_add()->set_notify_resume() after + * hlist_add_head(task->task_works); + */ + smp_mb__after_clear_bit(); + if (unlikely(!hlist_empty(¤t->task_works))) + task_work_run(); } #endif /* */ diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b..bf1034008ac 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o \ + signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17..3d93325e0b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -946,11 +946,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * an exiting task cleaning up the robust pi futexes, and in + * task_work_add() to avoid the race with exit_task_work(). */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_task_work(tsk); + exit_irq_thread(); if (unlikely(in_atomic())) diff --git a/kernel/fork.c b/kernel/fork.c index 05c813dc9ec..a46db217a58 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1411,6 +1411,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); + INIT_HLIST_HEAD(&p->task_works); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 00000000000..82d1c794066 --- /dev/null +++ b/kernel/task_work.c @@ -0,0 +1,84 @@ +#include +#include +#include + +int +task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +{ + unsigned long flags; + int err = -ESRCH; + +#ifndef TIF_NOTIFY_RESUME + if (notify) + return -ENOTSUPP; +#endif + /* + * We must not insert the new work if the task has already passed + * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() + * and check PF_EXITING under pi_lock. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (likely(!(task->flags & PF_EXITING))) { + hlist_add_head(&twork->hlist, &task->task_works); + err = 0; + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ + if (likely(!err) && notify) + set_notify_resume(task); + return err; +} + +struct task_work * +task_work_cancel(struct task_struct *task, task_work_func_t func) +{ + unsigned long flags; + struct task_work *twork; + struct hlist_node *pos; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + hlist_for_each_entry(twork, pos, &task->task_works, hlist) { + if (twork->func == func) { + hlist_del(&twork->hlist); + goto found; + } + } + twork = NULL; + found: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + return twork; +} + +void task_work_run(void) +{ + struct task_struct *task = current; + struct hlist_head task_works; + struct hlist_node *pos; + + raw_spin_lock_irq(&task->pi_lock); + hlist_move_list(&task->task_works, &task_works); + raw_spin_unlock_irq(&task->pi_lock); + + if (unlikely(hlist_empty(&task_works))) + return; + /* + * We use hlist to save the space in task_struct, but we want fifo. + * Find the last entry, the list should be short, then process them + * in reverse order. + */ + for (pos = task_works.first; pos->next; pos = pos->next) + ; + + for (;;) { + struct hlist_node **pprev = pos->pprev; + struct task_work *twork = container_of(pos, struct task_work, + hlist); + twork->func(twork); + + if (pprev == &task_works.first) + break; + pos = container_of(pprev, struct hlist_node, next); + } +} -- cgit v1.2.3-70-g09d2 From 4d1d61a6b203d957777d73fcebf19d90b038b5b2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 May 2012 10:59:08 +1000 Subject: genirq: reimplement exit_irq_thread() hook via task_work_add() exit_irq_thread() and task->irq_thread are needed to handle the unexpected (and unlikely) exit of irq-thread. We can use task_work instead and make this all private to kernel/irq/manage.c, cleanup plus micro-optimization. 1. rename exit_irq_thread() to irq_thread_dtor(), make it static, and move it up before irq_thread(). 2. change irq_thread() to do task_work_add(irq_thread_dtor) at the start and task_work_cancel() before return. tracehook_notify_resume() can never play with kthreads, only do_exit()->exit_task_work() can call the callback and this is what we want. 3. remove task_struct->irq_thread and the special hook in do_exit(). Signed-off-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Cc: David Howells Cc: Richard Kuo Cc: Linus Torvalds Cc: Alexander Gordeev Cc: Chris Zankel Cc: David Smith Cc: "Frank Ch. Eigler" Cc: Geert Uytterhoeven Cc: Larry Woodman Cc: Peter Zijlstra Cc: Tejun Heo Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/interrupt.h | 4 --- include/linux/sched.h | 10 ++----- kernel/exit.c | 2 -- kernel/irq/manage.c | 68 +++++++++++++++++++++++------------------------ 4 files changed, 35 insertions(+), 49 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c91171599cb..e68a8e53bb5 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -142,8 +142,6 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id); - -extern void exit_irq_thread(void); #else extern int __must_check @@ -177,8 +175,6 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, { return request_irq(irq, handler, 0, devname, percpu_dev_id); } - -static inline void exit_irq_thread(void) { } #endif extern void free_irq(unsigned int, void *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7930131abc1..da013853a62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1301,11 +1301,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; -#ifdef CONFIG_GENERIC_HARDIRQS - /* IRQ handler threads */ - unsigned irq_thread:1; -#endif - pid_t pid; pid_t tgid; @@ -1313,10 +1308,9 @@ struct task_struct { /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; #endif - - /* + /* * pointers to (original) parent process, youngest child, younger sibling, - * older sibling, respectively. (p->father can be replaced with + * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ struct task_struct __rcu *real_parent; /* real parent process */ diff --git a/kernel/exit.c b/kernel/exit.c index 3d93325e0b1..3ecd096e5d4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -954,8 +954,6 @@ void do_exit(long code) exit_task_work(tsk); - exit_irq_thread(); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bb32326afe8..4d1f8f89741 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -773,11 +774,39 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } +static void irq_thread_dtor(struct task_work *unused) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + struct irqaction *action; + + if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) + return; + + action = kthread_data(tsk); + + pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); + + + desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. + */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action); +} + /* * Interrupt handler thread */ static int irq_thread(void *data) { + struct task_work on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -793,7 +822,9 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irq_thread = 1; + + init_task_work(&on_exit_work, irq_thread_dtor, NULL); + task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { irqreturn_t action_ret; @@ -815,44 +846,11 @@ static int irq_thread(void *data) * cannot touch the oneshot mask at this point anymore as * __setup_irq() might have given out currents thread_mask * again. - * - * Clear irq_thread. Otherwise exit_irq_thread() would make - * fuzz about an active irq thread going into nirvana. */ - current->irq_thread = 0; + task_work_cancel(current, irq_thread_dtor); return 0; } -/* - * Called from do_exit() - */ -void exit_irq_thread(void) -{ - struct task_struct *tsk = current; - struct irq_desc *desc; - struct irqaction *action; - - if (!tsk->irq_thread) - return; - - action = kthread_data(tsk); - - pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - - desc = irq_to_desc(action->irq); - - /* - * If IRQTF_RUNTHREAD is set, we need to decrement - * desc->threads_active and wake possible waiters. - */ - if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) - wake_threads_waitq(desc); - - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action); -} - static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) -- cgit v1.2.3-70-g09d2 From 43e13cc107cf6cd3c15fbe1cef849435c2223d50 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 May 2012 16:26:16 -0700 Subject: cred: remove task_is_dead() from __task_cred() validation Commit 8f92054e7ca1 ("CRED: Fix __task_cred()'s lockdep check and banner comment"): add the following validation condition: task->exit_state >= 0 to permit the access if the target task is dead and therefore unable to change its own credentials. OK, but afaics currently this can only help wait_task_zombie() which calls __task_cred() without rcu lock. Remove this validation and change wait_task_zombie() to use task_uid() instead. This means we do rcu_read_lock() only to shut up the lockdep, but we already do the same in, say, wait_task_stopped(). task_is_dead() should die, task->exit_state != 0 means that this task has passed exit_notify(), only do_wait-like code paths should use this. Unfortunately, we can't kill task_is_dead() right now, it has already acquired buggy users in drivers/staging. The fix already exists. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Acked-by: David Howells Cc: Jiri Olsa Cc: Paul E. McKenney Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 10 +++------- kernel/exit.c | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/cred.h b/include/linux/cred.h index 917dc5aeb1d..ebbed2ce663 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -277,17 +277,13 @@ static inline void put_cred(const struct cred *_cred) * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU - * readlock or the task must be dead and unable to change its own credentials. + * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ -#define __task_cred(task) \ - ({ \ - const struct task_struct *__t = (task); \ - rcu_dereference_check(__t->real_cred, \ - task_is_dead(__t)); \ - }) +#define __task_cred(task) \ + rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials diff --git a/kernel/exit.c b/kernel/exit.c index 910a0716e17..3281493ce7a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); + uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) -- cgit v1.2.3-70-g09d2 From 168eeccbc956d2ec083c3a513f7706784ee0dc5f Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 31 May 2012 16:26:16 -0700 Subject: stack usage: add pid to warning printk in check_stack_usage In embedded systems, sometimes the same program (busybox) is the cause of multiple warnings. Outputting the pid with the program name in the warning printk helps distinguish which instances of a program are using the stack most. This is a small patch, but useful. Signed-off-by: Tim Bird Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 3281493ce7a..6d85655353e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -884,9 +884,9 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " - "left\n", - current->comm, free); + printk(KERN_WARNING "%s (%d) used greatest stack depth: " + "%lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); -- cgit v1.2.3-70-g09d2