From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Mon, 13 Feb 2012 03:58:52 +0000
Subject: security: trim security.h

Trim security.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/exit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6..5ad867a3685 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
 #include <linux/writeback.h>
+#include <linux/shm.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3-70-g09d2


From 37f08be11be9a7d9351fb1b9b408259519a126f3 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <marcos.mage@gmail.com>
Date: Tue, 21 Feb 2012 23:57:47 +0100
Subject: PM / Freezer: Remove references to TIF_FREEZE in comments

This patch removes all the references in the code about the TIF_FREEZE
flag removed by commit a3201227f803ad7fd43180c5195dbe5a2bf998aa

    freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE

There still are some references to TIF_FREEZE in
Documentation/power/freezing-of-tasks.txt, but it looks like that
documentation needs more thorough work to reflect how the new
freezer works, and hence merely removing the references to TIF_FREEZE
won't really help. So I have not touched that part in this patch.

Suggested-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Marcos Paulo de Souza <marcos.mage@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/exit.c          | 2 +-
 kernel/freezer.c       | 6 +++---
 kernel/power/process.c | 8 +++-----
 3 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 294b1709170..fd0af05e063 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -424,7 +424,7 @@ void daemonize(const char *name, ...)
 	 */
 	exit_mm(current);
 	/*
-	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
+	 * We don't want to get frozen, in case system-wide hibernation
 	 * or suspend transition begins right now.
 	 */
 	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed..11f82a4d4ea 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
  * freeze_task - send a freeze request to given task
  * @p: task to send the request to
  *
- * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
- * flag and either sending a fake signal to it or waking it up, depending
- * on whether it has %PF_FREEZER_NOSIG set.
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
  *
  * RETURNS:
  * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6aeb5efe00e..0d2aeb22610 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
 			 * It is "frozen enough".  If the task does wake
 			 * up, it will immediately call try_to_freeze.
 			 *
-			 * Because freeze_task() goes through p's
-			 * scheduler lock after setting TIF_FREEZE, it's
-			 * guaranteed that either we see TASK_RUNNING or
-			 * try_to_stop() after schedule() in ptrace/signal
-			 * stop sees TIF_FREEZE.
+			 * Because freeze_task() goes through p's scheduler lock, it's
+			 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
+			 * transition can't race with task state testing here.
 			 */
 			if (!task_is_stopped_or_traced(p) &&
 			    !freezer_should_skip(p))
-- 
cgit v1.2.3-70-g09d2


From 5234ffb9f74cfa8993d174782bc861dd9b7b5bfb Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Fri, 9 Mar 2012 14:59:59 +0100
Subject: genirq: Get rid of unnecessary IRQTF_DIED flag

Currently IRQTF_DIED flag is set when a IRQ thread handler calls do_exit()
But also PF_EXITING per process flag gets set when a thread exits. This
fix eliminates the duplicate by using PF_EXITING flag.

Also, there is a race condition in exit_irq_thread(). In case a thread's
bit is cleared in desc->threads_oneshot (and the IRQ line gets unmasked),
but before IRQTF_DIED flag is set, a new interrupt might come in and set
just cleared bit again, this time forever. This fix throws IRQTF_DIED flag
away, eliminating the race as a result.

[ tglx: Test THREAD_EXITING first as suggested by Oleg ]

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Link: http://lkml.kernel.org/r/20120309135958.GD2114@dhcp-26-207.brq.redhat.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/exit.c          |  4 ++--
 kernel/irq/handle.c    |  2 +-
 kernel/irq/internals.h |  2 --
 kernel/irq/manage.c    | 11 +----------
 4 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6..752d2c0abd1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -935,8 +935,6 @@ void do_exit(long code)
 		schedule();
 	}
 
-	exit_irq_thread();
-
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
@@ -945,6 +943,8 @@ void do_exit(long code)
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 
+	exit_irq_thread();
+
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
 				current->comm, task_pid_nr(current),
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bb..500aaf67c54 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
 	 * device interrupt, so no irq storm is lurking. If the
 	 * RUNTHREAD bit is already set, nothing to do.
 	 */
-	if (test_bit(IRQTF_DIED, &action->thread_flags) ||
+	if ((action->thread->flags & PF_EXITING) ||
 	    test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
 		return;
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b7952316016..5c233e0ea2c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,14 +20,12 @@ extern bool noirqdebug;
 /*
  * Bits used by threaded handlers:
  * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
- * IRQTF_DIED      - handler thread died
  * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
  * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
  * IRQTF_FORCED_THREAD  - irq action is force threaded
  */
 enum {
 	IRQTF_RUNTHREAD,
-	IRQTF_DIED,
 	IRQTF_WARNED,
 	IRQTF_AFFINITY,
 	IRQTF_FORCED_THREAD,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3feab4ab687..a94466db73b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -845,17 +845,8 @@ void exit_irq_thread(void)
 
 	desc = irq_to_desc(action->irq);
 
-	/*
-	 * Prevent a stale desc->threads_oneshot. Must be called
-	 * before setting the IRQTF_DIED flag.
-	 */
+	/* Prevent a stale desc->threads_oneshot */
 	irq_finalize_oneshot(desc, action, true);
-
-	/*
-	 * Set the THREAD DIED flag to prevent further wakeups of the
-	 * soon to be gone threaded handler.
-	 */
-	set_bit(IRQTF_DIED, &action->flags);
 }
 
 static void irq_setup_forced_threading(struct irqaction *new)
-- 
cgit v1.2.3-70-g09d2


From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 19 Mar 2012 17:03:22 +0100
Subject: exit_signal: simplify the "we have changed execution domain" logic

exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id"
to handle the "we have changed execution domain" case.

We can change do_thread() to always set ->exit_signal = SIGCHLD
and remove this check to simplify the code.

We could change setup_new_exec() instead, this looks more logical
because it increments ->self_exec_id. But note that de_thread()
already resets ->exit_signal if it changes the leader, let's keep
both changes close to each other.

Note that we change ->exit_signal lockless, this changes the rules.
Thereafter ->exit_signal is not stable under tasklist but this is
fine, the only possible change is OLDSIG -> SIGCHLD. This can race
with eligible_child() but the race is harmless. We can race with
reparent_leader() which changes our ->exit_signal in parallel, but
it does the same change to SIGCHLD.

The noticeable user-visible change is that the execing task is not
"visible" to do_wait()->eligible_child(__WCLONE) right after exec.
To me this looks more logical, and this is consistent with mt case.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c     | 3 +++
 kernel/exit.c | 7 +------
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/fs/exec.c b/fs/exec.c
index b0695a9900e..1e94d2263ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -977,6 +977,9 @@ static int de_thread(struct task_struct *tsk)
 	sig->notify_count = 0;
 
 no_thread_group:
+	/* we have changed execution domain */
+	tsk->exit_signal = SIGCHLD;
+
 	if (current->mm)
 		setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 752d2c0abd1..51ac4ced131 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * If the parent exec id doesn't match the exec id we saved
 	 * when we started then we know the parent has changed security
 	 * domain.
-	 *
-	 * If our self_exec id doesn't match our parent_exec_id then
-	 * we have changed execution domain as these two values started
-	 * the same after a fork.
 	 */
 	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
-	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id))
+	    tsk->parent_exec_id != tsk->real_parent->self_exec_id)
 		tsk->exit_signal = SIGCHLD;
 
 	if (unlikely(tsk->ptrace)) {
-- 
cgit v1.2.3-70-g09d2


From b6e238dceed36891cc633167afe7151f1f3d83c5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 19 Mar 2012 17:03:41 +0100
Subject: exit_signal: fix the "parent has changed security domain" logic

exit_notify() changes ->exit_signal if the parent already did exec.
This doesn't really work, we are not going to send the signal now
if there is another live thread or the exiting task is traced. The
parent can exec before the last dies or the tracer detaches.

Move this check into do_notify_parent() which actually sends the
signal.

The user-visible change is that we do not change ->exit_signal,
and thus the exiting task is still "clone children" for
do_wait()->eligible_child(__WCLONE). Hopefully this is fine, the
current logic is racy anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 14 --------------
 kernel/signal.c |  9 +++++++++
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 51ac4ced131..ce5f758f40b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -818,20 +818,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
-	/* Let father know we died
-	 *
-	 * Thread signals are configurable, but you aren't going to use
-	 * that to send signals to arbitrary processes.
-	 * That stops right now.
-	 *
-	 * If the parent exec id doesn't match the exec id we saved
-	 * when we started then we know the parent has changed security
-	 * domain.
-	 */
-	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
-	    tsk->parent_exec_id != tsk->real_parent->self_exec_id)
-		tsk->exit_signal = SIGCHLD;
-
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
diff --git a/kernel/signal.c b/kernel/signal.c
index 8511e39813c..e76001ccf5c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	BUG_ON(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
+	if (sig != SIGCHLD) {
+		/*
+		 * This is only possible if parent == real_parent.
+		 * Check if it has changed security domain.
+		 */
+		if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+			sig = SIGCHLD;
+	}
+
 	info.si_signo = sig;
 	info.si_errno = 0;
 	/*
-- 
cgit v1.2.3-70-g09d2


From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 21 Mar 2012 16:34:13 -0700
Subject: mm, counters: remove task argument to sync_mm_rss() and
 __sync_task_rss_stat()

sync_mm_rss() can only be used for current to avoid race conditions in
iterating and clearing its per-task counters.  Remove the task argument
for it and its helper function, __sync_task_rss_stat(), to avoid thinking
it can be used safely for anything other than current.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c          |  2 +-
 include/linux/mm.h |  4 ++--
 kernel/exit.c      |  2 +-
 mm/memory.c        | 18 +++++++++---------
 mm/mmu_context.c   |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/fs/exec.c b/fs/exec.c
index 3908544f5d1..6ed164d20d7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -824,7 +824,7 @@ static int exec_mmap(struct mm_struct *mm)
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
-	sync_mm_rss(tsk, old_mm);
+	sync_mm_rss(old_mm);
 	mm_release(tsk, old_mm);
 
 	if (old_mm) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index df17ff23d50..ce2b2a3b287 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1131,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 }
 
 #if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
+void sync_mm_rss(struct mm_struct *mm);
 #else
-static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+static inline void sync_mm_rss(struct mm_struct *mm)
 {
 }
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index 0ed15fed579..d26acd3c1e2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -934,7 +934,7 @@ void do_exit(long code)
 	acct_update_integrals(tsk);
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
-		sync_mm_rss(tsk, tsk->mm);
+		sync_mm_rss(tsk->mm);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/mm/memory.c b/mm/memory.c
index a5de734e14a..2d27239ce4d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
 
 #if defined(SPLIT_RSS_COUNTING)
 
-static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+static void __sync_task_rss_stat(struct mm_struct *mm)
 {
 	int i;
 
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		if (task->rss_stat.count[i]) {
-			add_mm_counter(mm, i, task->rss_stat.count[i]);
-			task->rss_stat.count[i] = 0;
+		if (current->rss_stat.count[i]) {
+			add_mm_counter(mm, i, current->rss_stat.count[i]);
+			current->rss_stat.count[i] = 0;
 		}
 	}
-	task->rss_stat.events = 0;
+	current->rss_stat.events = 0;
 }
 
 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,12 +157,12 @@ static void check_sync_rss_stat(struct task_struct *task)
 	if (unlikely(task != current))
 		return;
 	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
-		__sync_task_rss_stat(task, task->mm);
+		__sync_task_rss_stat(task->mm);
 }
 
-void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
+void sync_mm_rss(struct mm_struct *mm)
 {
-	__sync_task_rss_stat(task, mm);
+	__sync_task_rss_stat(mm);
 }
 #else /* SPLIT_RSS_COUNTING */
 
@@ -643,7 +643,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 	int i;
 
 	if (current->mm == mm)
-		sync_mm_rss(current, mm);
+		sync_mm_rss(mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080..3dcfaf4ed35 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
-	sync_mm_rss(tsk, mm);
+	sync_mm_rss(mm);
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
-- 
cgit v1.2.3-70-g09d2


From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Fri, 23 Mar 2012 15:01:54 -0700
Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process
 supervision

Userspace service managers/supervisors need to track their started
services.  Many services daemonize by double-forking and get implicitly
re-parented to PID 1.  The service manager will no longer be able to
receive the SIGCHLD signals for them, and is no longer in charge of
reaping the children with wait().  All information about the children is
lost at the moment PID 1 cleans up the re-parented processes.

With this prctl, a service manager process can mark itself as a sort of
'sub-init', able to stay as the parent for all orphaned processes
created by the started services.  All SIGCHLD signals will be delivered
to the service manager.

Receiving SIGCHLD and doing wait() is in cases of a service-manager much
preferred over any possible asynchronous notification about specific
PIDs, because the service manager has full access to the child process
data in /proc and the PID can not be re-used until the wait(), the
service-manager itself is in charge of, has happened.

As a side effect, the relevant parent PID information does not get lost
by a double-fork, which results in a more elaborate process tree and
'ps' output:

before:
  # ps afx
  253 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  294 ?        Sl     0:00 /usr/libexec/polkit-1/polkitd
  328 ?        S      0:00 /usr/sbin/modem-manager
  608 ?        Sl     0:00 /usr/libexec/colord
  658 ?        Sl     0:00 /usr/libexec/upowerd
  819 ?        Sl     0:00 /usr/libexec/imsettings-daemon
  916 ?        Sl     0:00 /usr/libexec/udisks-daemon
  917 ?        S      0:00  \_ udisks-daemon: not polling any devices

after:
  # ps afx
  294 ?        Ss     0:00 /bin/dbus-daemon --system --nofork
  426 ?        Sl     0:00  \_ /usr/libexec/polkit-1/polkitd
  449 ?        S      0:00  \_ /usr/sbin/modem-manager
  635 ?        Sl     0:00  \_ /usr/libexec/colord
  705 ?        Sl     0:00  \_ /usr/libexec/upowerd
  959 ?        Sl     0:00  \_ /usr/libexec/udisks-daemon
  960 ?        S      0:00  |   \_ udisks-daemon: not polling any devices
  977 ?        Sl     0:00  \_ /usr/libexec/packagekitd

This prctl is orthogonal to PID namespaces.  PID namespaces are isolated
from each other, while a service management process usually requires the
services to live in the same namespace, to be able to talk to each
other.

Users of this will be the systemd per-user instance, which provides
init-like functionality for the user's login session and D-Bus, which
activates bus services on-demand.  Both need init-like capabilities to
be able to properly keep track of the services they start.

Many thanks to Oleg for several rounds of review and insights.

[akpm@linux-foundation.org: fix comment layout and spelling]
[akpm@linux-foundation.org: add lengthy code comment from Oleg]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h |  3 +++
 include/linux/sched.h | 12 ++++++++++++
 kernel/exit.c         | 33 ++++++++++++++++++++++++++++-----
 kernel/fork.c         |  3 +++
 kernel/sys.c          |  8 ++++++++
 5 files changed, 54 insertions(+), 5 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a0413ac3abe..e0cfec2490a 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -121,4 +121,7 @@
 #define PR_SET_PTRACER 0x59616d61
 # define PR_SET_PTRACER_ANY ((unsigned long)-1)
 
+#define PR_SET_CHILD_SUBREAPER 36
+#define PR_GET_CHILD_SUBREAPER 37
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c147a4260a..0c3854b0d4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,6 +553,18 @@ struct signal_struct {
 	int			group_stop_count;
 	unsigned int		flags; /* see SIGNAL_* flags below */
 
+	/*
+	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
+	 * manager, to re-parent orphan (double-forking) child processes
+	 * to this process instead of 'init'. The service manager is
+	 * able to receive SIGCHLD signals and is able to investigate
+	 * the process until it calls wait(). All children of this
+	 * process will inherit a flag if they should look for a
+	 * child_subreaper process at exit.
+	 */
+	unsigned int		is_child_subreaper:1;
+	unsigned int		has_child_subreaper:1;
+
 	/* POSIX.1b Interval Timers */
 	struct list_head posix_timers;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 16b07bfac22..456329fd4ea 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
 }
 
 /*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ *    child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
 	__releases(&tasklist_lock)
@@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 		 * forget_original_parent() must move them somewhere.
 		 */
 		pid_ns->child_reaper = init_pid_ns.child_reaper;
+	} else if (father->signal->has_child_subreaper) {
+		struct task_struct *reaper;
+
+		/*
+		 * Find the first ancestor marked as child_subreaper.
+		 * Note that the code below checks same_thread_group(reaper,
+		 * pid_ns->child_reaper).  This is what we need to DTRT in a
+		 * PID namespace. However we still need the check above, see
+		 * http://marc.info/?l=linux-kernel&m=131385460420380
+		 */
+		for (reaper = father->real_parent;
+		     reaper != &init_task;
+		     reaper = reaper->real_parent) {
+			if (same_thread_group(reaper, pid_ns->child_reaper))
+				break;
+			if (!reaper->signal->is_child_subreaper)
+				continue;
+			thread = reaper;
+			do {
+				if (!(thread->flags & PF_EXITING))
+					return reaper;
+			} while_each_thread(reaper, thread);
+		}
 	}
 
 	return pid_ns->child_reaper;
diff --git a/kernel/fork.c b/kernel/fork.c
index 37674ec55cd..b9372a0bff1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
+	sig->has_child_subreaper = current->signal->has_child_subreaper ||
+				   current->signal->is_child_subreaper;
+
 	mutex_init(&sig->cred_guard_mutex);
 
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd19..9eb7fcab8df 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_MM:
 			error = prctl_set_mm(arg2, arg3, arg4, arg5);
 			break;
+		case PR_SET_CHILD_SUBREAPER:
+			me->signal->is_child_subreaper = !!arg2;
+			error = 0;
+			break;
+		case PR_GET_CHILD_SUBREAPER:
+			error = put_user(me->signal->is_child_subreaper,
+					 (int __user *) arg2);
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3-70-g09d2


From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 23 Mar 2012 15:01:54 -0700
Subject: kernel/exit.c: if init dies, log a signal which killed it, if any

I just received another user's pleas for help when their init
mysteriously died.  I again explained that they need to check whether it
died because of bad instruction, a segv, or something else.  Which was
an annoying detour into writing a trivial C program to spawn his init
and print its exit code:

  http://lists.busybox.net/pipermail/busybox/2012-January/077172.html

I hear you saying "just test it under /bin/sh".  Well, the crashing init
_was_ /bin/sh.

Which prompted me to make kernel do this first step automatically.  We can
print exit code, which makes it possible to see that death was from e.g.
SIGILL without writing test programs.

[akpm@linux-foundation.org: add 0x to hex number output]
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 456329fd4ea..3db1909faed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 	if (unlikely(pid_ns->child_reaper == father)) {
 		write_unlock_irq(&tasklist_lock);
-		if (unlikely(pid_ns == &init_pid_ns))
-			panic("Attempted to kill init!");
+		if (unlikely(pid_ns == &init_pid_ns)) {
+			panic("Attempted to kill init! exitcode=0x%08x\n",
+				father->signal->group_exit_code ?:
+					father->exit_code);
+		}
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-- 
cgit v1.2.3-70-g09d2