From 3f95aa81d265223fdb13ea2b59883766a05adbdf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Aug 2014 06:10:23 -0700 Subject: rcu: Make TASKS_RCU handle tasks that are almost done exiting Once a task has passed exit_notify() in the do_exit() code path, it is no longer on the task lists, and is therefore no longer visible to rcu_tasks_kthread(). This means that an almost-exited task might be preempted while within a trampoline, and this task won't be waited on by rcu_tasks_kthread(). This commit fixes this bug by adding an srcu_struct. An exiting task does srcu_read_lock() just before calling exit_notify(), and does the corresponding srcu_read_unlock() after doing the final preempt_disable(). This means that rcu_tasks_kthread() can do synchronize_srcu() to wait for all mostly-exited tasks to reach their final preempt_disable() region, and then use synchronize_sched() to wait for those tasks to finish exiting. Reported-by: Oleg Nesterov Suggested-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a..d13f2eec4bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -667,6 +667,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -775,6 +776,7 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -814,6 +816,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed -- cgit v1.2.3-70-g09d2 From 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:36 -0400 Subject: exit: Always reap resource stats in __exit_signal() Oleg pointed out that wait_task_zombie adds a task's usage statistics to the parent's signal struct, but the task's own signal struct should also propagate the statistics at exit time. This allows thread_group_cputime(reaped_zombie) to get the statistics after __unhash_process() has made the task invisible to for_each_thread, but before the thread has actually been rcu freed, making sure no non-monotonic results are returned inside that window. 
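The invariant this patch preserves can be sketched outside the kernel: a dying worker folds its counters into the group-wide totals in the same critical section that unlinks it from the live list, so a reader that sums "reaped totals + live threads" never watches a contribution disappear. The sketch below is a plain pthreads analogue, not kernel code; group_total, live_work and snapshot() are illustrative names standing in for sig->utime/stime, per-task cputime and thread_group_cputime().

/* Userspace analogue of the __exit_signal() accounting change: a dying
 * worker folds its counter into the group total in the same critical
 * section that unlinks it, so readers summing "total + live" never see
 * the sum go backwards.  All names are illustrative, not kernel APIs. */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long group_total;          /* plays the role of the sig-> totals        */
static long live_work[NWORKERS];  /* per-thread counters, like task cputime    */
static int  live[NWORKERS];       /* membership in the "thread list"           */

static void *worker(void *arg)
{
        int id = (int)(long)arg;

        for (int i = 0; i < 1000; i++) {
                pthread_mutex_lock(&lock);
                live_work[id]++;                  /* accrue while alive */
                pthread_mutex_unlock(&lock);
        }

        /* Exit path: reap own counters and unlink in one critical section,
         * mirroring __exit_signal() adding into sig-> before __unhash_process(). */
        pthread_mutex_lock(&lock);
        group_total += live_work[id];
        live[id] = 0;
        pthread_mutex_unlock(&lock);
        return NULL;
}

static long snapshot(void)                /* like thread_group_cputime() */
{
        long sum;

        pthread_mutex_lock(&lock);
        sum = group_total;
        for (int i = 0; i < NWORKERS; i++)
                if (live[i])
                        sum += live_work[i];
        pthread_mutex_unlock(&lock);
        return sum;                       /* never decreases over time */
}

int main(void)
{
        pthread_t t[NWORKERS];
        long prev = 0;

        for (long i = 0; i < NWORKERS; i++) {
                live[i] = 1;
                pthread_create(&t[i], NULL, worker, (void *)i);
        }
        for (int i = 0; i < 100000; i++) {
                long now = snapshot();
                if (now < prev)
                        printf("non-monotonic: %ld -> %ld\n", prev, now);
                prev = now;
        }
        for (int i = 0; i < NWORKERS; i++)
                pthread_join(t[i], NULL);
        printf("final: %ld\n", snapshot());
        return 0;
}

In the kernel the same handoff happens under sig->stats_lock, as the following patches show.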
Suggested-by: Oleg Nesterov Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: David Rientjes Cc: Guillaume Morin Cc: Ionut Alexa Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a..b93d46dab6f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads but the group leader + * as they die, so they can be added into the process-wide totals + * when those are taken. The group leader stays around as a zombie as + * long as there are other threads. When it gets reaped, the exit.c + * code will add its counts into these totals. We won't ever get here + * for the group leader, since it will have been the last reference on + * the signal_struct. + */ + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); -- cgit v1.2.3-70-g09d2 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. 
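The read/retry protocol behind that seqlock can be sketched in plain C11: a writer bumps a sequence counter to odd before touching the statistics and back to even afterwards, and a reader retries whenever the counter was odd or changed under it. This is a simplified userspace analogue only; the kernel's read_seqbegin_or_lock() additionally lets a reader fall back to taking the lock after a failed lockless pass, and a production version would also need the data accesses themselves properly fenced.

/* Simplified analogue of the seqlock protecting sig->stats_lock style
 * statistics.  Writers serialize among themselves; readers are lockless
 * and retry if a write overlapped their snapshot. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_uint seq;                     /* even: stable, odd: write in flight */
static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long utime, stime;          /* the published statistics           */

static void add_stats(unsigned long du, unsigned long ds)
{
        pthread_mutex_lock(&write_lock);    /* writers still serialize            */
        atomic_fetch_add(&seq, 1);          /* seq becomes odd                    */
        utime += du;
        stime += ds;
        atomic_fetch_add(&seq, 1);          /* seq even again, snapshot is valid  */
        pthread_mutex_unlock(&write_lock);
}

static void read_stats(unsigned long *u, unsigned long *s)
{
        unsigned int start;

        do {
                do {                        /* wait out an in-flight writer       */
                        start = atomic_load(&seq);
                } while (start & 1);
                *u = utime;
                *s = stime;
        } while (atomic_load(&seq) != start);  /* retry if a writer slipped in    */
}

int main(void)
{
        unsigned long u, s;

        add_stats(3, 5);
        add_stats(2, 1);
        read_stats(&u, &s);
        printf("utime=%lu stime=%lu\n", u, s);  /* utime=5 stime=6 */
        return 0;
}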
As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 ++++ kernel/fork.c | 1 + kernel/sched/cputime.c | 33 ++++++++++++++++++++------------- kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -------------- 6 files changed, 26 insertions(+), 29 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52..dd9eb480738 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6f..fa09b86609d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. 
*/ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e49..9387ae8ab04 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359b..49b7cfe98f7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. 
- */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a2..b6636643cbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5..492b986195d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3-70-g09d2 From 1029a2b52c09e479fd7b07275812ad97868c0fb0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:49 +0200 Subject: sched, exit: Deal with nested sleeps do_wait() is a big wait loop, but we set TASK_RUNNING too late; we end up calling potential sleeps before we reset it. Not strictly a bug since we're guaranteed to exit the loop and not call schedule(); put in annotations to quiet might_sleep(). WARNING: CPU: 0 PID: 1 at ../kernel/sched/core.c:7123 __might_sleep+0x7e/0x90() do not call blocking ops when !TASK_RUNNING; state=1 set at [] do_wait+0x88/0x270 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x4c/0x50 [] __might_sleep+0x7e/0x90 [] might_fault+0x55/0xb0 [] wait_consider_task+0x90b/0xc10 [] do_wait+0x104/0x270 [] SyS_wait4+0x77/0x100 [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: umgwanakikbuti@gmail.com Cc: ilya.dryomov@inktank.com Cc: Alex Elder Cc: Andrew Morton Cc: Axel Lin Cc: Daniel Borkmann Cc: Dave Jones Cc: Guillaume Morin Cc: Ionut Alexa Cc: Jason Baron Cc: Linus Torvalds Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140924082242.186408915@infradead.org Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 ++ kernel/exit.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'kernel/exit.c') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b..5068a0d9fec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -175,10 +175,12 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define sched_annotate_sleep() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff95..232c4bc8bcc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); + sched_annotate_sleep(); retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; @@ -1210,6 +1213,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage -- cgit v1.2.3-70-g09d2 From e1c2296c3485158304bfad5a80e89078463d70c8 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 16 Oct 2014 14:59:48 -0400 Subject: tty: Move session_of_pgrp() and make static tiocspgrp() is the lone caller of session_of_pgrp(); relocate and limit to file scope. Signed-off-by: Peter Hurley Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 21 +++++++++++++++++++++ include/linux/kernel.h | 3 --- kernel/exit.c | 21 --------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel/exit.c') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 114854c5554..ae8f53c7972 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2515,6 +2515,27 @@ struct pid *tty_get_pgrp(struct tty_struct *tty) } EXPORT_SYMBOL_GPL(tty_get_pgrp); +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. 
+ */ +static struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + /** * tiocgpgrp - get process group * @tty: tty passed by user diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b..01bc530fbfc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -411,9 +411,6 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -struct pid; -extern struct pid *session_of_pgrp(struct pid *pgrp); - unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff95..6a3e2e5004b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,27 +214,6 @@ repeat: goto repeat; } -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected -- cgit v1.2.3-70-g09d2 From dc2fd4b00946751ebd222d366fc64550e4188dc2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:24 -0800 Subject: exit: reparent: use ->ptrace_entry rather than ->sibling for EXIT_DEAD tasks reparent_leader() reuses ->sibling as a list node to add an EXIT_DEAD task into dead_children list we are going to release. This obviously removes the dead task from its real_parent->children list and this is even good; the parent can do nothing with the EXIT_DEAD reparented zombie, it only makes do_wait() slower. But, this also means that it can not be reparented once again, so if its new parent dies too nobody will update ->parent/real_parent, they can point to the freed memory even before release_task() we are going to call, this breaks the code which relies on pid_alive() to access ->real_parent/parent. Fortunately this is mostly theoretical, this can only happen if init or PR_SET_CHILD_SUBREAPER process ignores SIGCHLD and the new parent sub-thread exits right after we drop tasklist_lock. Change this code to use ->ptrace_entry instead, we know that the child is not traced so nobody can ever use this member. This also allows to unify this logic with exit_ptrace(), see the next changes. Note: we really need to change release_task() to nullify real_parent/ parent/group_leader pointers, but we need to change the current users first somehow. And it would be better to reap this zombie immediately but release_task_locked() we need is complicated by proc_flush_task(). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. 
Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 232c4bc8bcc..0272305bf85 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -548,7 +548,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } @@ -587,8 +587,8 @@ static void forget_original_parent(struct task_struct *father) BUG_ON(!list_empty(&father->children)); - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); + list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { + list_del_init(&p->ptrace_entry); release_task(p); } } -- cgit v1.2.3-70-g09d2 From 57a059187d5ba5592e36c6f23d046bc37616f346 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:27 -0800 Subject: exit: reparent: cleanup the changing of ->parent 1. Cosmetic, but "if (t->parent == father)" looks a bit confusing. We need to change t->parent if and only if t is not traced. 2. If we actually want this BUG_ON() to ensure that parent/ptrace match each other, then we should also take ptrace_reparented() case into account too. 3. Change this code to use for_each_thread() instead of deprecated while_each_thread(). [dan.carpenter@oracle.com: silence a bogus static checker warning] Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 0272305bf85..464971e6923 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -557,7 +557,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper; + struct task_struct *p, *t, *n, *reaper; LIST_HEAD(dead_children); write_lock_irq(&tasklist_lock); @@ -569,18 +569,15 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - - do { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); + } reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3-70-g09d2 From 2831096e21503897ee474c23131c3feb8db0ffb1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:30 -0800 Subject: exit: reparent: cleanup the usage of reparent_leader() 1. Now that reparent_leader() doesn't abuse ->sibling we can shift list_move_tail() from reparent_leader() to forget_original_parent() and turn it into a single list_splice_tail_init(). This also makes BUG_ON(!list_empty()) and list_for_each_entry_safe() unnecessary. 2. 
This also allows to shift the same_thread_group() check, it looks a bit more clear in the caller. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 464971e6923..772e9175735 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -529,15 +529,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ @@ -568,7 +560,7 @@ static void forget_original_parent(struct task_struct *father) exit_ptrace(father); reaper = find_new_reaper(father); - list_for_each_entry_safe(p, n, &father->children, sibling) { + list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; BUG_ON((!t->ptrace) != (t->parent == father)); @@ -578,12 +570,16 @@ static void forget_original_parent(struct task_struct *father) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); } - reparent_leader(father, p, &dead_children); + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, &dead_children); } + list_splice_tail_init(&father->children, &reaper->children); write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&father->children)); - list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); -- cgit v1.2.3-70-g09d2 From 7c8bd2322c7fd973d089b27de55e29c92c667a06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:33 -0800 Subject: exit: ptrace: shift "reap dead" code from exit_ptrace() to forget_original_parent() Now that forget_original_parent() uses ->ptrace_entry for EXIT_DEAD tasks, we can simply pass "dead_children" list to exit_ptrace() and remove another release_task() loop. Plus this way we do not need to drop and reacquire tasklist_lock. Also shift the list_empty(ptraced) check, if we want this optimization it makes sense to eliminate the function call altogether. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. 
Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- kernel/exit.c | 10 ++++------ kernel/ptrace.c | 23 +++-------------------- 3 files changed, 8 insertions(+), 27 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index cc79eff4a1a..987a73a40ef 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -52,7 +52,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern void exit_ptrace(struct task_struct *tracer); +extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 diff --git a/kernel/exit.c b/kernel/exit.c index 772e9175735..9c9526d8727 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -553,13 +553,11 @@ static void forget_original_parent(struct task_struct *father) LIST_HEAD(dead_children); write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, &dead_children); + /* Can drop and reacquire tasklist_lock */ + reaper = find_new_reaper(father); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c..1eb9d90c3af 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. */ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); + list_add(&p->ptrace_entry, dead); } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) -- cgit v1.2.3-70-g09d2 From f6507f83bccd4a5f7dc7091079bf58128dc56d66 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:45 -0800 Subject: exit: wait: cleanup the ptrace_reparented() checks Now that EXIT_DEAD is the terminal state we can kill "int traced" variable and check "state == EXIT_DEAD" instead to cleanup the code. In particular, this way it is clear that the check obviously doesn't need tasklist_lock. Also fix the type of "unsigned long state", "long" was always wrong although this doesn't matter because cmpxchg/xchg uses typeof(*ptr). 
[akpm@linux-foundation.org: don't make me google the C Operator Precedence table] Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9c9526d8727..20875d6398a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -973,8 +973,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -999,19 +998,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - - traced = ptrace_reparented(p); /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; + /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * Check thread_group_leader() to exclude the traced sub-threads. */ - if (likely(!traced) && thread_group_leader(p)) { + if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; -- cgit v1.2.3-70-g09d2 From f953ccd00615140b5e722ffe2b920da22dfb4db9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:48 -0800 Subject: exit: wait: don't use zombie->real_parent 1. wait_task_zombie() uses p->real_parent to get psig/siglock. This is correct but needs tasklist_lock, ->real_parent can exit. We can use "current" instead. This is our natural child, its parent must be our sub-thread. 2. Read psig/sig outside of ->siglock, ->signal is no longer protected by this lock. 3. Fix the outdated comments about tasklist_lock. We can not race with __exit_signal(), the whole thread group is dead, nobody but us can call it. Also clarify the usage of ->stats_lock and ->siglock. Note: thread_group_cputime_adjusted() is sub-optimal in this case, we probably want to export cputime_adjust() to avoid thread_group_cputime(). The comment says "all threads" but there are no other threads. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 20875d6398a..457673d6593 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1010,8 +1010,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * Check thread_group_leader() to exclude the traced sub-threads. 
*/ if (state == EXIT_DEAD && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1023,21 +1023,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(¤t->sighand->siglock); write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; @@ -1062,7 +1061,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock(&psig->stats_lock); - spin_unlock_irq(&p->real_parent->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); } /* -- cgit v1.2.3-70-g09d2 From 986094dfe161b4346831547136d4e5ed7f94310e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:51 -0800 Subject: exit: wait: drop tasklist_lock before psig->c* accounting wait_task_zombie() no longer needs tasklist_lock to accumulate the psig->c* counters, we can drop it right after cmpxchg(exit_state). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 457673d6593..6297eb0f5bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1005,6 +1005,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; + /* + * We own this thread, nobody else can reap it. + */ + read_unlock(&tasklist_lock); + sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. @@ -1064,13 +1069,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_unlock_irq(¤t->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we its state == DEAD/TRACE. - */ - read_unlock(&tasklist_lock); - sched_annotate_sleep(); - retval = wo->wo_rusage ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) -- cgit v1.2.3-70-g09d2 From 26e75b5c3d2226cb995fde064744aa93f63849c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:54 -0800 Subject: exit: release_task: fix the comment about group leader accounting Contrary to what the comment in __exit_signal() says we do account the group leader. Fix this and explain why. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 6297eb0f5bd..9a65f10dc9f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk) } /* - * Accumulate here the counters for all threads but the group leader - * as they die, so they can be added into the process-wide totals - * when those are taken. The group leader stays around as a zombie as - * long as there are other threads. When it gets reaped, the exit.c - * code will add its counts into these totals. We won't ever get here - * for the group leader, since it will have been the last reference on - * the signal_struct. + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); -- cgit v1.2.3-70-g09d2 From 8a1296aea4a319b33c3367ff3805835e949a229f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:59 -0800 Subject: exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting The ->has_child_subreaper code in find_new_reaper() finds alive "thread" but returns another "reaper" thread which can be dead. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9a65f10dc9f..fd38a8f0436 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -512,7 +512,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) thread = reaper; do { if (!(thread->flags & PF_EXITING)) - return reaper; + return thread; } while_each_thread(reaper, thread); } } -- cgit v1.2.3-70-g09d2 From 7d24e2df52f596a1ea922e4f84a61f2fb24fbb70 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:02 -0800 Subject: exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting find_new_reaper() assumes that "has_child_subreaper" logic is safe as long as we are not the exiting ->child_reaper and this is doubly wrong: 1. In fact it is safe if "pid_ns->child_reaper == father"; there must be no children after zap_pid_ns_processes() returns, so it doesn't matter what we return in this case and even pid_ns->child_reaper is wrong otherwise: we can't reparent to ->child_reaper == current. This is not a bug, but this is confusing. 2. It is not safe if we are not pid_ns->child_reaper but from the same thread group. 
We drop tasklist_lock before zap_pid_ns_processes(), so another thread can lock it and choose the new reaper from the upper namespace if has_child_subreaper == T, and this is obviously wrong. This is not that bad, zap_pid_ns_processes() won't return until the the new reaper reaps all zombies, but this should be fixed anyway. We could change for_each_thread() loop to use ->exit_state instead of PF_EXITING which we had to use until 8aac62706ada, or we could change copy_signal() to check CLONE_NEWPID before setting has_child_subreaper, but lets change this code so that it is clear we can't look outside of our namespace, otherwise same_thread_group(reaper, child_reaper) check will look wrong and confusing anyway. We can simply start from "father" and fix the problem. We can't wrongly return a thread from the same thread group if ->is_child_subreaper == T, we know that all threads have PF_EXITING set. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index fd38a8f0436..9babd47a36e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -492,7 +492,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { + } + + if (father->signal->has_child_subreaper) { struct task_struct *reaper; /* @@ -502,7 +504,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * PID namespace. However we still need the check above, see * http://marc.info/?l=linux-kernel&m=131385460420380 */ - for (reaper = father->real_parent; + for (reaper = father; reaper != &init_task; reaper = reaper->real_parent) { if (same_thread_group(reaper, pid_ns->child_reaper)) -- cgit v1.2.3-70-g09d2 From 3750ef979cfa1296630aa9f23e265c1bd721498a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:05 -0800 Subject: exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper() Change find_new_reaper() to use for_each_thread() instead of deprecated while_each_thread(). We do not bother to check "thread != father" in the 1st loop, we can rely on PF_EXITING check. Note: this means the minor behavioural change: for_each_thread() starts from the group leader. But this should be fine, nobody should make any assumption about do_wait(__WNOTHREAD) when it comes to reparented tasks. And this can avoid the pointless reparenting to a short-living thread While zombie leaders are not that common. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9babd47a36e..a4204aaba8a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,8 +473,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; - thread = father; - while_each_thread(father, thread) { + for_each_thread(father, thread) { if (thread->flags & PF_EXITING) continue; if (unlikely(pid_ns->child_reaper == father)) @@ -511,11 +510,10 @@ static struct task_struct *find_new_reaper(struct task_struct *father) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { + for_each_thread(reaper, thread) { if (!(thread->flags & PF_EXITING)) return thread; - } while_each_thread(reaper, thread); + } } } -- cgit v1.2.3-70-g09d2 From 175aed3f8d38b87d3287bb765c794205f2b511de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:08 -0800 Subject: exit: reparent: document the ->has_child_subreaper checks Swap the "init_task" and same_thread_group() checks. This way it is more simple to document these checks and we can remove the link to the previous discussion on lkml. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index a4204aaba8a..576949ce566 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -495,18 +495,16 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (father->signal->has_child_subreaper) { struct task_struct *reaper; - /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. */ for (reaper = father; - reaper != &init_task; + !same_thread_group(reaper, pid_ns->child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; -- cgit v1.2.3-70-g09d2 From 1109909c7df08f55ff9104276bb9db1ee2e6e53d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:11 -0800 Subject: exit: reparent: introduce find_child_reaper() find_new_reaper() does 2 completely different things. Not only it finds a reaper, it also updates pid_ns->child_reaper or kills the whole namespace if the caller is ->child_reaper. Now that has_child_subreaper logic doesn't depend on child_reaper check we can move that pid_ns code into a separate helper. IMHO this makes the code more clean, and this allows the next changes. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 576949ce566..930fbe1b5ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -459,6 +459,34 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + for_each_thread(father, reaper) { + if (reaper->flags & PF_EXITING) + continue; + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; +} + /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists @@ -466,33 +494,17 @@ static void exit_mm(struct task_struct *tsk) * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; for_each_thread(father, thread) { if (thread->flags & PF_EXITING) continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; return thread; } - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } - if (father->signal->has_child_subreaper) { struct task_struct *reaper; /* @@ -501,7 +513,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * namespace, this is safe because all its threads are dead. 
*/ for (reaper = father; - !same_thread_group(reaper, pid_ns->child_reaper); + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) @@ -515,7 +527,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -552,7 +564,9 @@ static void forget_original_parent(struct task_struct *father) exit_ptrace(father, &dead_children); /* Can drop and reacquire tasklist_lock */ - reaper = find_new_reaper(father); + reaper = find_child_reaper(father); + + reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; -- cgit v1.2.3-70-g09d2 From c9dc05bfdb3f7fd7c00f3cbd33816c99d2cb9029 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:14 -0800 Subject: exit: reparent: introduce find_alive_thread() Add the new simple helper to factor out the for_each_thread() code in find_child_reaper() and find_new_reaper(). It can also simplify the potential PF_EXITING -> exit_state change, plus perhaps we can change this code to take SIGNAL_GROUP_EXIT into account. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 930fbe1b5ee..b0f482f5daf 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -459,6 +459,17 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + static struct task_struct *find_child_reaper(struct task_struct *father) __releases(&tasklist_lock) __acquires(&tasklist_lock) @@ -469,9 +480,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father) if (likely(reaper != father)) return reaper; - for_each_thread(father, reaper) { - if (reaper->flags & PF_EXITING) - continue; + reaper = find_alive_thread(father); + if (reaper) { pid_ns->child_reaper = reaper; return reaper; } @@ -497,16 +507,13 @@ static struct task_struct *find_child_reaper(struct task_struct *father) static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { - struct task_struct *thread; + struct task_struct *thread, *reaper; - for_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; + thread = find_alive_thread(father); + if (thread) return thread; - } if (father->signal->has_child_subreaper) { - struct task_struct *reaper; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. 
* We start from father to ensure we can not look into another @@ -520,10 +527,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father, break; if (!reaper->signal->is_child_subreaper) continue; - for_each_thread(reaper, thread) { - if (!(thread->flags & PF_EXITING)) - return thread; - } + thread = find_alive_thread(reaper); + if (thread) + return thread; } } -- cgit v1.2.3-70-g09d2 From ad9e206aefa56788b676ebcd6329e828f40d2238 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:17 -0800 Subject: exit: reparent: avoid find_new_reaper() if no children Now that pid_ns logic was isolated we can change forget_original_parent() to return right after find_child_reaper() when father->children is empty, there is nothing to reparent in this case. In particular this avoids find_alive_thread() and this can help if the whole process exits and it has a lot of PF_EXITING threads at the start of the thread list, this can easily lead to O(nr_threads ** 2) iterations. Trivial test case (tested under KVM, 2 CPUs): static void *tfunc(void *arg) { pause(); return NULL; } static int child(unsigned int nt) { pthread_t pt; while (nt--) assert(pthread_create(&pt, NULL, tfunc, NULL) == 0); pthread_kill(pt, SIGTRAP); pause(); return 0; } int main(int argc, const char *argv[]) { int stat; unsigned int nf = atoi(argv[1]); unsigned int nt = atoi(argv[2]); while (nf--) { if (!fork()) return child(nt); wait(&stat); assert(stat == SIGTRAP); } return 0; } $ time ./test 16 16536 shows: real user sys - 5m37.628s 0m4.437s 8m5.560s + 0m50.032s 0m7.130s 1m4.927s Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index b0f482f5daf..063745699f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,8 @@ static void forget_original_parent(struct task_struct *father) /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father); + if (list_empty(&father->children)) + goto unlock; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { @@ -591,6 +593,7 @@ static void forget_original_parent(struct task_struct *father) reparent_leader(father, p, &dead_children); } list_splice_tail_init(&father->children, &reaper->children); + unlock: write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { -- cgit v1.2.3-70-g09d2 From 482a3767e5087f6e6ad2486a6655aaa5f3d59301 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:20 -0800 Subject: exit: reparent: call forget_original_parent() under tasklist_lock Shift "release dead children" loop from forget_original_parent() to its caller, exit_notify(). It is safe to reap them even if our parent reaps us right after we drop tasklist_lock, those children no longer have any connection to the exiting task. And this allows us to avoid write_lock_irq(tasklist_lock) right after it was released by forget_original_parent(), we can simply call it with tasklist_lock held. While at it, move the comment about forget_original_parent() up to this function. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 063745699f7..8061891ddd9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,19 +560,26 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *t, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); if (unlikely(!list_empty(&father->ptraced))) - exit_ptrace(father, &dead_children); + exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father); if (list_empty(&father->children)) - goto unlock; + return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { @@ -590,16 +597,9 @@ static void forget_original_parent(struct task_struct *father) * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) - reparent_leader(father, p, &dead_children); + reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); - unlock: - write_unlock_irq(&tasklist_lock); - - list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } } /* @@ -609,18 +609,12 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -644,6 +638,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + /* If the process is dead, release it - nobody will wait for it */ if (autoreap) release_task(tsk); -- cgit v1.2.3-70-g09d2 From 6c66e7dba3d4419c8b973505679635efcd6b311c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:23 -0800 Subject: exit: exit_notify: re-use "dead" list to autoreap current After the previous change we can add just the exiting EXIT_DEAD task to the "dead" list and remove another release_task(tsk). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 8061891ddd9..8714e5ded8b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -632,6 +632,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) @@ -642,10 +644,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) list_del_init(&p->ptrace_entry); release_task(p); } - - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); } #ifdef CONFIG_DEBUG_STACK_USAGE -- cgit v1.2.3-70-g09d2 From 3245d6acab981a2388ffb877c7ecc97e763c59d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Jan 2015 14:32:12 -0800 Subject: exit: fix race between wait_consider_task() and wait_task_zombie() wait_consider_task() checks EXIT_ZOMBIE after EXIT_DEAD/EXIT_TRACE and both checks can fail if we race with EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE change in between, gcc needs to reload p->exit_state after security_task_wait(). In this case ->notask_error will be wrongly cleared and do_wait() can hang forever if it was the last eligible child. Many thanks to Arne who carefully investigated the problem. Note: this bug is very old but it was pure theoretical until commit b3ab03160dfa ("wait: completely ignore the EXIT_DEAD tasks"). Before this commit "-O2" was probably enough to guarantee that compiler won't read ->exit_state twice. Signed-off-by: Oleg Nesterov Reported-by: Arne Goedeke Tested-by: Arne Goedeke Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 1ea4369890a..6806c55475e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* -- cgit v1.2.3-70-g09d2