From 3f95aa81d265223fdb13ea2b59883766a05adbdf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Aug 2014 06:10:23 -0700 Subject: rcu: Make TASKS_RCU handle tasks that are almost done exiting Once a task has passed exit_notify() in the do_exit() code path, it is no longer on the task lists, and is therefore no longer visible to rcu_tasks_kthread(). This means that an almost-exited task might be preempted while within a trampoline, and this task won't be waited on by rcu_tasks_kthread(). This commit fixes this bug by adding an srcu_struct. An exiting task does srcu_read_lock() just before calling exit_notify(), and does the corresponding srcu_read_unlock() after doing the final preempt_disable(). This means that rcu_tasks_kthread() can do synchronize_srcu() to wait for all mostly-exited tasks to reach their final preempt_disable() region, and then use synchronize_sched() to wait for those tasks to finish exiting. Reported-by: Oleg Nesterov Suggested-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a..d13f2eec4bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -667,6 +667,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -775,6 +776,7 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -814,6 +816,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed -- cgit v1.2.3-70-g09d2 From 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:36 -0400 Subject: exit: Always reap resource stats in __exit_signal() Oleg pointed out that wait_task_zombie adds a task's usage statistics to the parent's signal struct, but the task's own signal struct should also propagate the statistics at exit time. This allows thread_group_cputime(reaped_zombie) to get the statistics after __unhash_process() has made the task invisible to for_each_thread, but before the thread has actually been rcu freed, making sure no non-monotonic results are returned inside that window. 
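The invariant this patch preserves can be sketched outside the kernel: a dying worker folds its counters into the group-wide totals in the same critical section that unlinks it from the live list, so a reader that sums "reaped totals + live threads" never watches a contribution disappear. The sketch below is a plain pthreads analogue, not kernel code; group_total, live_work and snapshot() are illustrative names standing in for sig->utime/stime, per-task cputime and thread_group_cputime().

/* Userspace analogue of the __exit_signal() accounting change: a dying
 * worker folds its counter into the group total in the same critical
 * section that unlinks it, so readers summing "total + live" never see
 * the sum go backwards.  All names are illustrative, not kernel APIs. */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long group_total;          /* plays the role of the sig-> totals        */
static long live_work[NWORKERS];  /* per-thread counters, like task cputime    */
static int  live[NWORKERS];       /* membership in the "thread list"           */

static void *worker(void *arg)
{
        int id = (int)(long)arg;

        for (int i = 0; i < 1000; i++) {
                pthread_mutex_lock(&lock);
                live_work[id]++;                  /* accrue while alive */
                pthread_mutex_unlock(&lock);
        }

        /* Exit path: reap own counters and unlink in one critical section,
         * mirroring __exit_signal() adding into sig-> before __unhash_process(). */
        pthread_mutex_lock(&lock);
        group_total += live_work[id];
        live[id] = 0;
        pthread_mutex_unlock(&lock);
        return NULL;
}

static long snapshot(void)                /* like thread_group_cputime() */
{
        long sum;

        pthread_mutex_lock(&lock);
        sum = group_total;
        for (int i = 0; i < NWORKERS; i++)
                if (live[i])
                        sum += live_work[i];
        pthread_mutex_unlock(&lock);
        return sum;                       /* never decreases over time */
}

int main(void)
{
        pthread_t t[NWORKERS];
        long prev = 0;

        for (long i = 0; i < NWORKERS; i++) {
                live[i] = 1;
                pthread_create(&t[i], NULL, worker, (void *)i);
        }
        for (int i = 0; i < 100000; i++) {
                long now = snapshot();
                if (now < prev)
                        printf("non-monotonic: %ld -> %ld\n", prev, now);
                prev = now;
        }
        for (int i = 0; i < NWORKERS; i++)
                pthread_join(t[i], NULL);
        printf("final: %ld\n", snapshot());
        return 0;
}

In the kernel the same handoff happens under sig->stats_lock, as the following patches show.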
Suggested-by: Oleg Nesterov Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: David Rientjes Cc: Guillaume Morin Cc: Ionut Alexa Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a..b93d46dab6f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads but the group leader + * as they die, so they can be added into the process-wide totals + * when those are taken. The group leader stays around as a zombie as + * long as there are other threads. When it gets reaped, the exit.c + * code will add its counts into these totals. We won't ever get here + * for the group leader, since it will have been the last reference on + * the signal_struct. + */ + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); -- cgit v1.2.3-70-g09d2 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. 
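The read/retry protocol behind that seqlock can be sketched in plain C11: a writer bumps a sequence counter to odd before touching the statistics and back to even afterwards, and a reader retries whenever the counter was odd or changed under it. This is a simplified userspace analogue only; the kernel's read_seqbegin_or_lock() additionally lets a reader fall back to taking the lock after a failed lockless pass, and a production version would also need the data accesses themselves properly fenced.

/* Simplified analogue of the seqlock protecting sig->stats_lock style
 * statistics.  Writers serialize among themselves; readers are lockless
 * and retry if a write overlapped their snapshot. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_uint seq;                     /* even: stable, odd: write in flight */
static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long utime, stime;          /* the published statistics           */

static void add_stats(unsigned long du, unsigned long ds)
{
        pthread_mutex_lock(&write_lock);    /* writers still serialize            */
        atomic_fetch_add(&seq, 1);          /* seq becomes odd                    */
        utime += du;
        stime += ds;
        atomic_fetch_add(&seq, 1);          /* seq even again, snapshot is valid  */
        pthread_mutex_unlock(&write_lock);
}

static void read_stats(unsigned long *u, unsigned long *s)
{
        unsigned int start;

        do {
                do {                        /* wait out an in-flight writer       */
                        start = atomic_load(&seq);
                } while (start & 1);
                *u = utime;
                *s = stime;
        } while (atomic_load(&seq) != start);  /* retry if a writer slipped in    */
}

int main(void)
{
        unsigned long u, s;

        add_stats(3, 5);
        add_stats(2, 1);
        read_stats(&u, &s);
        printf("utime=%lu stime=%lu\n", u, s);  /* utime=5 stime=6 */
        return 0;
}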
As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 ++++ kernel/fork.c | 1 + kernel/sched/cputime.c | 33 ++++++++++++++++++++------------- kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -------------- 6 files changed, 26 insertions(+), 29 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52..dd9eb480738 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6f..fa09b86609d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. 
*/ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e49..9387ae8ab04 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359b..49b7cfe98f7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. 
- */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a2..b6636643cbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(¤t->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5..492b986195d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3-70-g09d2 From 1029a2b52c09e479fd7b07275812ad97868c0fb0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:49 +0200 Subject: sched, exit: Deal with nested sleeps do_wait() is a big wait loop, but we set TASK_RUNNING too late; we end up calling potential sleeps before we reset it. Not strictly a bug since we're guaranteed to exit the loop and not call schedule(); put in annotations to quiet might_sleep(). WARNING: CPU: 0 PID: 1 at ../kernel/sched/core.c:7123 __might_sleep+0x7e/0x90() do not call blocking ops when !TASK_RUNNING; state=1 set at [] do_wait+0x88/0x270 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x4c/0x50 [] __might_sleep+0x7e/0x90 [] might_fault+0x55/0xb0 [] wait_consider_task+0x90b/0xc10 [] do_wait+0x104/0x270 [] SyS_wait4+0x77/0x100 [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: umgwanakikbuti@gmail.com Cc: ilya.dryomov@inktank.com Cc: Alex Elder Cc: Andrew Morton Cc: Axel Lin Cc: Daniel Borkmann Cc: Dave Jones Cc: Guillaume Morin Cc: Ionut Alexa Cc: Jason Baron Cc: Linus Torvalds Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140924082242.186408915@infradead.org Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 ++ kernel/exit.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'kernel/exit.c') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b..5068a0d9fec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -175,10 +175,12 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define sched_annotate_sleep() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff95..232c4bc8bcc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); + sched_annotate_sleep(); retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; @@ -1210,6 +1213,7 @@ unlock_sig: pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage -- cgit v1.2.3-70-g09d2 From e1c2296c3485158304bfad5a80e89078463d70c8 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 16 Oct 2014 14:59:48 -0400 Subject: tty: Move session_of_pgrp() and make static tiocspgrp() is the lone caller of session_of_pgrp(); relocate and limit to file scope. Signed-off-by: Peter Hurley Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 21 +++++++++++++++++++++ include/linux/kernel.h | 3 --- kernel/exit.c | 21 --------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel/exit.c') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 114854c5554..ae8f53c7972 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2515,6 +2515,27 @@ struct pid *tty_get_pgrp(struct tty_struct *tty) } EXPORT_SYMBOL_GPL(tty_get_pgrp); +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. 
+ */ +static struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + /** * tiocgpgrp - get process group * @tty: tty passed by user diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b..01bc530fbfc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -411,9 +411,6 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -struct pid; -extern struct pid *session_of_pgrp(struct pid *pgrp); - unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); diff --git a/kernel/exit.c b/kernel/exit.c index 5d30019ff95..6a3e2e5004b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,27 +214,6 @@ repeat: goto repeat; } -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected -- cgit v1.2.3-70-g09d2 From dc2fd4b00946751ebd222d366fc64550e4188dc2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:24 -0800 Subject: exit: reparent: use ->ptrace_entry rather than ->sibling for EXIT_DEAD tasks reparent_leader() reuses ->sibling as a list node to add an EXIT_DEAD task into dead_children list we are going to release. This obviously removes the dead task from its real_parent->children list and this is even good; the parent can do nothing with the EXIT_DEAD reparented zombie, it only makes do_wait() slower. But, this also means that it can not be reparented once again, so if its new parent dies too nobody will update ->parent/real_parent, they can point to the freed memory even before release_task() we are going to call, this breaks the code which relies on pid_alive() to access ->real_parent/parent. Fortunately this is mostly theoretical, this can only happen if init or PR_SET_CHILD_SUBREAPER process ignores SIGCHLD and the new parent sub-thread exits right after we drop tasklist_lock. Change this code to use ->ptrace_entry instead, we know that the child is not traced so nobody can ever use this member. This also allows to unify this logic with exit_ptrace(), see the next changes. Note: we really need to change release_task() to nullify real_parent/ parent/group_leader pointers, but we need to change the current users first somehow. And it would be better to reap this zombie immediately but release_task_locked() we need is complicated by proc_flush_task(). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. 
Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 232c4bc8bcc..0272305bf85 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -548,7 +548,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } @@ -587,8 +587,8 @@ static void forget_original_parent(struct task_struct *father) BUG_ON(!list_empty(&father->children)); - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); + list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { + list_del_init(&p->ptrace_entry); release_task(p); } } -- cgit v1.2.3-70-g09d2 From 57a059187d5ba5592e36c6f23d046bc37616f346 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:27 -0800 Subject: exit: reparent: cleanup the changing of ->parent 1. Cosmetic, but "if (t->parent == father)" looks a bit confusing. We need to change t->parent if and only if t is not traced. 2. If we actually want this BUG_ON() to ensure that parent/ptrace match each other, then we should also take ptrace_reparented() case into account too. 3. Change this code to use for_each_thread() instead of deprecated while_each_thread(). [dan.carpenter@oracle.com: silence a bogus static checker warning] Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 0272305bf85..464971e6923 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -557,7 +557,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper; + struct task_struct *p, *t, *n, *reaper; LIST_HEAD(dead_children); write_lock_irq(&tasklist_lock); @@ -569,18 +569,15 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - - do { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); + } reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3-70-g09d2 From 2831096e21503897ee474c23131c3feb8db0ffb1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:30 -0800 Subject: exit: reparent: cleanup the usage of reparent_leader() 1. Now that reparent_leader() doesn't abuse ->sibling we can shift list_move_tail() from reparent_leader() to forget_original_parent() and turn it into a single list_splice_tail_init(). This also makes BUG_ON(!list_empty()) and list_for_each_entry_safe() unnecessary. 2. 
This also allows to shift the same_thread_group() check, it looks a bit more clear in the caller. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 464971e6923..772e9175735 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -529,15 +529,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ @@ -568,7 +560,7 @@ static void forget_original_parent(struct task_struct *father) exit_ptrace(father); reaper = find_new_reaper(father); - list_for_each_entry_safe(p, n, &father->children, sibling) { + list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; BUG_ON((!t->ptrace) != (t->parent == father)); @@ -578,12 +570,16 @@ static void forget_original_parent(struct task_struct *father) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); } - reparent_leader(father, p, &dead_children); + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, &dead_children); } + list_splice_tail_init(&father->children, &reaper->children); write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&father->children)); - list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); -- cgit v1.2.3-70-g09d2 From 7c8bd2322c7fd973d089b27de55e29c92c667a06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:33 -0800 Subject: exit: ptrace: shift "reap dead" code from exit_ptrace() to forget_original_parent() Now that forget_original_parent() uses ->ptrace_entry for EXIT_DEAD tasks, we can simply pass "dead_children" list to exit_ptrace() and remove another release_task() loop. Plus this way we do not need to drop and reacquire tasklist_lock. Also shift the list_empty(ptraced) check, if we want this optimization it makes sense to eliminate the function call altogether. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. 
Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- kernel/exit.c | 10 ++++------ kernel/ptrace.c | 23 +++-------------------- 3 files changed, 8 insertions(+), 27 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index cc79eff4a1a..987a73a40ef 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -52,7 +52,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern void exit_ptrace(struct task_struct *tracer); +extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 diff --git a/kernel/exit.c b/kernel/exit.c index 772e9175735..9c9526d8727 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -553,13 +553,11 @@ static void forget_original_parent(struct task_struct *father) LIST_HEAD(dead_children); write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, &dead_children); + /* Can drop and reacquire tasklist_lock */ + reaper = find_new_reaper(father); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c..1eb9d90c3af 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. */ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); + list_add(&p->ptrace_entry, dead); } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) -- cgit v1.2.3-70-g09d2 From f6507f83bccd4a5f7dc7091079bf58128dc56d66 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:45 -0800 Subject: exit: wait: cleanup the ptrace_reparented() checks Now that EXIT_DEAD is the terminal state we can kill "int traced" variable and check "state == EXIT_DEAD" instead to cleanup the code. In particular, this way it is clear that the check obviously doesn't need tasklist_lock. Also fix the type of "unsigned long state", "long" was always wrong although this doesn't matter because cmpxchg/xchg uses typeof(*ptr). 
[akpm@linux-foundation.org: don't make me google the C Operator Precedence table] Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9c9526d8727..20875d6398a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -973,8 +973,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -999,19 +998,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - - traced = ptrace_reparented(p); /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; + /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * Check thread_group_leader() to exclude the traced sub-threads. */ - if (likely(!traced) && thread_group_leader(p)) { + if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; -- cgit v1.2.3-70-g09d2 From f953ccd00615140b5e722ffe2b920da22dfb4db9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:48 -0800 Subject: exit: wait: don't use zombie->real_parent 1. wait_task_zombie() uses p->real_parent to get psig/siglock. This is correct but needs tasklist_lock, ->real_parent can exit. We can use "current" instead. This is our natural child, its parent must be our sub-thread. 2. Read psig/sig outside of ->siglock, ->signal is no longer protected by this lock. 3. Fix the outdated comments about tasklist_lock. We can not race with __exit_signal(), the whole thread group is dead, nobody but us can call it. Also clarify the usage of ->stats_lock and ->siglock. Note: thread_group_cputime_adjusted() is sub-optimal in this case, we probably want to export cputime_adjust() to avoid thread_group_cputime(). The comment says "all threads" but there are no other threads. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 20875d6398a..457673d6593 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1010,8 +1010,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * Check thread_group_leader() to exclude the traced sub-threads. 
*/ if (state == EXIT_DEAD && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1023,21 +1023,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(¤t->sighand->siglock); write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; @@ -1062,7 +1061,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock(&psig->stats_lock); - spin_unlock_irq(&p->real_parent->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); } /* -- cgit v1.2.3-70-g09d2 From 986094dfe161b4346831547136d4e5ed7f94310e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:51 -0800 Subject: exit: wait: drop tasklist_lock before psig->c* accounting wait_task_zombie() no longer needs tasklist_lock to accumulate the psig->c* counters, we can drop it right after cmpxchg(exit_state). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 457673d6593..6297eb0f5bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1005,6 +1005,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; + /* + * We own this thread, nobody else can reap it. + */ + read_unlock(&tasklist_lock); + sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. @@ -1064,13 +1069,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_unlock_irq(¤t->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we its state == DEAD/TRACE. - */ - read_unlock(&tasklist_lock); - sched_annotate_sleep(); - retval = wo->wo_rusage ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) -- cgit v1.2.3-70-g09d2 From 26e75b5c3d2226cb995fde064744aa93f63849c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:54 -0800 Subject: exit: release_task: fix the comment about group leader accounting Contrary to what the comment in __exit_signal() says we do account the group leader. Fix this and explain why. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 6297eb0f5bd..9a65f10dc9f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk) } /* - * Accumulate here the counters for all threads but the group leader - * as they die, so they can be added into the process-wide totals - * when those are taken. The group leader stays around as a zombie as - * long as there are other threads. When it gets reaped, the exit.c - * code will add its counts into these totals. We won't ever get here - * for the group leader, since it will have been the last reference on - * the signal_struct. + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); -- cgit v1.2.3-70-g09d2 From 8a1296aea4a319b33c3367ff3805835e949a229f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:59 -0800 Subject: exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting The ->has_child_subreaper code in find_new_reaper() finds alive "thread" but returns another "reaper" thread which can be dead. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9a65f10dc9f..fd38a8f0436 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -512,7 +512,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) thread = reaper; do { if (!(thread->flags & PF_EXITING)) - return reaper; + return thread; } while_each_thread(reaper, thread); } } -- cgit v1.2.3-70-g09d2 From 7d24e2df52f596a1ea922e4f84a61f2fb24fbb70 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:02 -0800 Subject: exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting find_new_reaper() assumes that "has_child_subreaper" logic is safe as long as we are not the exiting ->child_reaper and this is doubly wrong: 1. In fact it is safe if "pid_ns->child_reaper == father"; there must be no children after zap_pid_ns_processes() returns, so it doesn't matter what we return in this case and even pid_ns->child_reaper is wrong otherwise: we can't reparent to ->child_reaper == current. This is not a bug, but this is confusing. 2. It is not safe if we are not pid_ns->child_reaper but from the same thread group. 
We drop tasklist_lock before zap_pid_ns_processes(), so another thread can lock it and choose the new reaper from the upper namespace if has_child_subreaper == T, and this is obviously wrong. This is not that bad, zap_pid_ns_processes() won't return until the the new reaper reaps all zombies, but this should be fixed anyway. We could change for_each_thread() loop to use ->exit_state instead of PF_EXITING which we had to use until 8aac62706ada, or we could change copy_signal() to check CLONE_NEWPID before setting has_child_subreaper, but lets change this code so that it is clear we can't look outside of our namespace, otherwise same_thread_group(reaper, child_reaper) check will look wrong and confusing anyway. We can simply start from "father" and fix the problem. We can't wrongly return a thread from the same thread group if ->is_child_subreaper == T, we know that all threads have PF_EXITING set. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index fd38a8f0436..9babd47a36e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -492,7 +492,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { + } + + if (father->signal->has_child_subreaper) { struct task_struct *reaper; /* @@ -502,7 +504,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * PID namespace. However we still need the check above, see * http://marc.info/?l=linux-kernel&m=131385460420380 */ - for (reaper = father->real_parent; + for (reaper = father; reaper != &init_task; reaper = reaper->real_parent) { if (same_thread_group(reaper, pid_ns->child_reaper)) -- cgit v1.2.3-70-g09d2 From 3750ef979cfa1296630aa9f23e265c1bd721498a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:05 -0800 Subject: exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper() Change find_new_reaper() to use for_each_thread() instead of deprecated while_each_thread(). We do not bother to check "thread != father" in the 1st loop, we can rely on PF_EXITING check. Note: this means the minor behavioural change: for_each_thread() starts from the group leader. But this should be fine, nobody should make any assumption about do_wait(__WNOTHREAD) when it comes to reparented tasks. And this can avoid the pointless reparenting to a short-living thread While zombie leaders are not that common. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9babd47a36e..a4204aaba8a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,8 +473,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; - thread = father; - while_each_thread(father, thread) { + for_each_thread(father, thread) { if (thread->flags & PF_EXITING) continue; if (unlikely(pid_ns->child_reaper == father)) @@ -511,11 +510,10 @@ static struct task_struct *find_new_reaper(struct task_struct *father) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { + for_each_thread(reaper, thread) { if (!(thread->flags & PF_EXITING)) return thread; - } while_each_thread(reaper, thread); + } } } -- cgit v1.2.3-70-g09d2 From 175aed3f8d38b87d3287bb765c794205f2b511de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:08 -0800 Subject: exit: reparent: document the ->has_child_subreaper checks Swap the "init_task" and same_thread_group() checks. This way it is more simple to document these checks and we can remove the link to the previous discussion on lkml. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index a4204aaba8a..576949ce566 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -495,18 +495,16 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (father->signal->has_child_subreaper) { struct task_struct *reaper; - /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. */ for (reaper = father; - reaper != &init_task; + !same_thread_group(reaper, pid_ns->child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; -- cgit v1.2.3-70-g09d2 From 1109909c7df08f55ff9104276bb9db1ee2e6e53d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:11 -0800 Subject: exit: reparent: introduce find_child_reaper() find_new_reaper() does 2 completely different things. Not only it finds a reaper, it also updates pid_ns->child_reaper or kills the whole namespace if the caller is ->child_reaper. Now that has_child_subreaper logic doesn't depend on child_reaper check we can move that pid_ns code into a separate helper. IMHO this makes the code more clean, and this allows the next changes. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 576949ce566..930fbe1b5ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -459,6 +459,34 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + for_each_thread(father, reaper) { + if (reaper->flags & PF_EXITING) + continue; + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; +} + /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists @@ -466,33 +494,17 @@ static void exit_mm(struct task_struct *tsk) * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; for_each_thread(father, thread) { if (thread->flags & PF_EXITING) continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; return thread; } - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } - if (father->signal->has_child_subreaper) { struct task_struct *reaper; /* @@ -501,7 +513,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * namespace, this is safe because all its threads are dead. 
*/ for (reaper = father; - !same_thread_group(reaper, pid_ns->child_reaper); + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) @@ -515,7 +527,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -552,7 +564,9 @@ static void forget_original_parent(struct task_struct *father) exit_ptrace(father, &dead_children); /* Can drop and reacquire tasklist_lock */ - reaper = find_new_reaper(father); + reaper = find_child_reaper(father); + + reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; -- cgit v1.2.3-70-g09d2 From c9dc05bfdb3f7fd7c00f3cbd33816c99d2cb9029 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:14 -0800 Subject: exit: reparent: introduce find_alive_thread() Add the new simple helper to factor out the for_each_thread() code in find_child_reaper() and find_new_reaper(). It can also simplify the potential PF_EXITING -> exit_state change, plus perhaps we can change this code to take SIGNAL_GROUP_EXIT into account. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Kay Sievers Cc: Lennart Poettering Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 930fbe1b5ee..b0f482f5daf 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -459,6 +459,17 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + static struct task_struct *find_child_reaper(struct task_struct *father) __releases(&tasklist_lock) __acquires(&tasklist_lock) @@ -469,9 +480,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father) if (likely(reaper != father)) return reaper; - for_each_thread(father, reaper) { - if (reaper->flags & PF_EXITING) - continue; + reaper = find_alive_thread(father); + if (reaper) { pid_ns->child_reaper = reaper; return reaper; } @@ -497,16 +507,13 @@ static struct task_struct *find_child_reaper(struct task_struct *father) static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { - struct task_struct *thread; + struct task_struct *thread, *reaper; - for_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; + thread = find_alive_thread(father); + if (thread) return thread; - } if (father->signal->has_child_subreaper) { - struct task_struct *reaper; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. 
* We start from father to ensure we can not look into another @@ -520,10 +527,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father, break; if (!reaper->signal->is_child_subreaper) continue; - for_each_thread(reaper, thread) { - if (!(thread->flags & PF_EXITING)) - return thread; - } + thread = find_alive_thread(reaper); + if (thread) + return thread; } } -- cgit v1.2.3-70-g09d2 From ad9e206aefa56788b676ebcd6329e828f40d2238 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:17 -0800 Subject: exit: reparent: avoid find_new_reaper() if no children Now that pid_ns logic was isolated we can change forget_original_parent() to return right after find_child_reaper() when father->children is empty, there is nothing to reparent in this case. In particular this avoids find_alive_thread() and this can help if the whole process exits and it has a lot of PF_EXITING threads at the start of the thread list, this can easily lead to O(nr_threads ** 2) iterations. Trivial test case (tested under KVM, 2 CPUs): static void *tfunc(void *arg) { pause(); return NULL; } static int child(unsigned int nt) { pthread_t pt; while (nt--) assert(pthread_create(&pt, NULL, tfunc, NULL) == 0); pthread_kill(pt, SIGTRAP); pause(); return 0; } int main(int argc, const char *argv[]) { int stat; unsigned int nf = atoi(argv[1]); unsigned int nt = atoi(argv[2]); while (nf--) { if (!fork()) return child(nt); wait(&stat); assert(stat == SIGTRAP); } return 0; } $ time ./test 16 16536 shows: real user sys - 5m37.628s 0m4.437s 8m5.560s + 0m50.032s 0m7.130s 1m4.927s Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index b0f482f5daf..063745699f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,8 @@ static void forget_original_parent(struct task_struct *father) /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father); + if (list_empty(&father->children)) + goto unlock; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { @@ -591,6 +593,7 @@ static void forget_original_parent(struct task_struct *father) reparent_leader(father, p, &dead_children); } list_splice_tail_init(&father->children, &reaper->children); + unlock: write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { -- cgit v1.2.3-70-g09d2 From 482a3767e5087f6e6ad2486a6655aaa5f3d59301 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:20 -0800 Subject: exit: reparent: call forget_original_parent() under tasklist_lock Shift "release dead children" loop from forget_original_parent() to its caller, exit_notify(). It is safe to reap them even if our parent reaps us right after we drop tasklist_lock, those children no longer have any connection to the exiting task. And this allows us to avoid write_lock_irq(tasklist_lock) right after it was released by forget_original_parent(), we can simply call it with tasklist_lock held. While at it, move the comment about forget_original_parent() up to this function. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 063745699f7..8061891ddd9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,19 +560,26 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *t, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); if (unlikely(!list_empty(&father->ptraced))) - exit_ptrace(father, &dead_children); + exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father); if (list_empty(&father->children)) - goto unlock; + return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { @@ -590,16 +597,9 @@ static void forget_original_parent(struct task_struct *father) * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) - reparent_leader(father, p, &dead_children); + reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); - unlock: - write_unlock_irq(&tasklist_lock); - - list_for_each_entry_safe(p, n, &dead_children, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } } /* @@ -609,18 +609,12 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -644,6 +638,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + /* If the process is dead, release it - nobody will wait for it */ if (autoreap) release_task(tsk); -- cgit v1.2.3-70-g09d2 From 6c66e7dba3d4419c8b973505679635efcd6b311c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:23 -0800 Subject: exit: exit_notify: re-use "dead" list to autoreap current After the previous change we can add just the exiting EXIT_DEAD task to the "dead" list and remove another release_task(tsk). Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. 
Biederman" Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 8061891ddd9..8714e5ded8b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -632,6 +632,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) @@ -642,10 +644,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) list_del_init(&p->ptrace_entry); release_task(p); } - - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); } #ifdef CONFIG_DEBUG_STACK_USAGE -- cgit v1.2.3-70-g09d2 From 3245d6acab981a2388ffb877c7ecc97e763c59d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 8 Jan 2015 14:32:12 -0800 Subject: exit: fix race between wait_consider_task() and wait_task_zombie() wait_consider_task() checks EXIT_ZOMBIE after EXIT_DEAD/EXIT_TRACE and both checks can fail if we race with EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE change in between, gcc needs to reload p->exit_state after security_task_wait(). In this case ->notask_error will be wrongly cleared and do_wait() can hang forever if it was the last eligible child. Many thanks to Arne who carefully investigated the problem. Note: this bug is very old but it was pure theoretical until commit b3ab03160dfa ("wait: completely ignore the EXIT_DEAD tasks"). Before this commit "-O2" was probably enough to guarantee that compiler won't read ->exit_state twice. Signed-off-by: Oleg Nesterov Reported-by: Arne Goedeke Tested-by: Arne Goedeke Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 1ea4369890a..6806c55475e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); int ret; - if (unlikely(p->exit_state == EXIT_DEAD)) + if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, p); @@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (unlikely(p->exit_state == EXIT_TRACE)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. @@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* -- cgit v1.2.3-70-g09d2