From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: security: trim security.h Trim security.h Signed-off-by: Al Viro Signed-off-by: James Morris --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6..5ad867a3685 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 37f08be11be9a7d9351fb1b9b408259519a126f3 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 21 Feb 2012 23:57:47 +0100 Subject: PM / Freezer: Remove references to TIF_FREEZE in comments This patch removes all the references in the code about the TIF_FREEZE flag removed by commit a3201227f803ad7fd43180c5195dbe5a2bf998aa freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE There still are some references to TIF_FREEZE in Documentation/power/freezing-of-tasks.txt, but it looks like that documentation needs more thorough work to reflect how the new freezer works, and hence merely removing the references to TIF_FREEZE won't really help. So I have not touched that part in this patch. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Signed-off-by: Rafael J. Wysocki --- kernel/exit.c | 2 +- kernel/freezer.c | 6 +++--- kernel/power/process.c | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170..fd0af05e063 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -424,7 +424,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. */ current->flags |= (PF_NOFREEZE | PF_KTHREAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed..11f82a4d4ea 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/power/process.c b/kernel/power/process.c index 6aeb5efe00e..0d2aeb22610 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) -- cgit v1.2.3-70-g09d2 From 5234ffb9f74cfa8993d174782bc861dd9b7b5bfb Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Mar 2012 14:59:59 +0100 Subject: genirq: Get rid of unnecessary IRQTF_DIED flag Currently IRQTF_DIED flag is set when a IRQ thread handler calls do_exit() But also PF_EXITING per process flag gets set when a thread exits. This fix eliminates the duplicate by using PF_EXITING flag. Also, there is a race condition in exit_irq_thread(). In case a thread's bit is cleared in desc->threads_oneshot (and the IRQ line gets unmasked), but before IRQTF_DIED flag is set, a new interrupt might come in and set just cleared bit again, this time forever. This fix throws IRQTF_DIED flag away, eliminating the race as a result. [ tglx: Test THREAD_EXITING first as suggested by Oleg ] Reported-by: Oleg Nesterov Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120309135958.GD2114@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/exit.c | 4 ++-- kernel/irq/handle.c | 2 +- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 11 +---------- 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6..752d2c0abd1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -935,8 +935,6 @@ void do_exit(long code) schedule(); } - exit_irq_thread(); - exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against @@ -945,6 +943,8 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_irq_thread(); + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bb..500aaf67c54 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) * device interrupt, so no irq storm is lurking. If the * RUNTHREAD bit is already set, nothing to do. */ - if (test_bit(IRQTF_DIED, &action->thread_flags) || + if ((action->thread->flags & PF_EXITING) || test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016..5c233e0ea2c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,14 +20,12 @@ extern bool noirqdebug; /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run - * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, - IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3feab4ab687..a94466db73b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -845,17 +845,8 @@ void exit_irq_thread(void) desc = irq_to_desc(action->irq); - /* - * Prevent a stale desc->threads_oneshot. Must be called - * before setting the IRQTF_DIED flag. - */ + /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action, true); - - /* - * Set the THREAD DIED flag to prevent further wakeups of the - * soon to be gone threaded handler. - */ - set_bit(IRQTF_DIED, &action->flags); } static void irq_setup_forced_threading(struct irqaction *new) -- cgit v1.2.3-70-g09d2 From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:22 +0100 Subject: exit_signal: simplify the "we have changed execution domain" logic exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id" to handle the "we have changed execution domain" case. We can change do_thread() to always set ->exit_signal = SIGCHLD and remove this check to simplify the code. We could change setup_new_exec() instead, this looks more logical because it increments ->self_exec_id. But note that de_thread() already resets ->exit_signal if it changes the leader, let's keep both changes close to each other. Note that we change ->exit_signal lockless, this changes the rules. Thereafter ->exit_signal is not stable under tasklist but this is fine, the only possible change is OLDSIG -> SIGCHLD. This can race with eligible_child() but the race is harmless. We can race with reparent_leader() which changes our ->exit_signal in parallel, but it does the same change to SIGCHLD. The noticeable user-visible change is that the execing task is not "visible" to do_wait()->eligible_child(__WCLONE) right after exec. To me this looks more logical, and this is consistent with mt case. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ kernel/exit.c | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index b0695a9900e..1e94d2263ae 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -977,6 +977,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + /* we have changed execution domain */ + tsk->exit_signal = SIGCHLD; + if (current->mm) setmax_mm_hiwater_rss(&sig->maxrss, current->mm); diff --git a/kernel/exit.c b/kernel/exit.c index 752d2c0abd1..51ac4ced131 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * If the parent exec id doesn't match the exec id we saved * when we started then we know the parent has changed security * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. */ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) + tsk->parent_exec_id != tsk->real_parent->self_exec_id) tsk->exit_signal = SIGCHLD; if (unlikely(tsk->ptrace)) { -- cgit v1.2.3-70-g09d2 From b6e238dceed36891cc633167afe7151f1f3d83c5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:41 +0100 Subject: exit_signal: fix the "parent has changed security domain" logic exit_notify() changes ->exit_signal if the parent already did exec. This doesn't really work, we are not going to send the signal now if there is another live thread or the exiting task is traced. The parent can exec before the last dies or the tracer detaches. Move this check into do_notify_parent() which actually sends the signal. The user-visible change is that we do not change ->exit_signal, and thus the exiting task is still "clone children" for do_wait()->eligible_child(__WCLONE). Hopefully this is fine, the current logic is racy anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 -------------- kernel/signal.c | 9 +++++++++ 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 51ac4ced131..ce5f758f40b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -818,20 +818,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - tsk->parent_exec_id != tsk->real_parent->self_exec_id) - tsk->exit_signal = SIGCHLD; - if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/signal.c b/kernel/signal.c index 8511e39813c..e76001ccf5c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + if (sig != SIGCHLD) { + /* + * This is only possible if parent == real_parent. + * Check if it has changed security domain. + */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* -- cgit v1.2.3-70-g09d2 From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:13 -0700 Subject: mm, counters: remove task argument to sync_mm_rss() and __sync_task_rss_stat() sync_mm_rss() can only be used for current to avoid race conditions in iterating and clearing its per-task counters. Remove the task argument for it and its helper function, __sync_task_rss_stat(), to avoid thinking it can be used safely for anything other than current. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm.h | 4 ++-- kernel/exit.c | 2 +- mm/memory.c | 18 +++++++++--------- mm/mmu_context.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 3908544f5d1..6ed164d20d7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -824,7 +824,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(tsk, old_mm); + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index df17ff23d50..ce2b2a3b287 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, } #if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); +void sync_mm_rss(struct mm_struct *mm); #else -static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +static inline void sync_mm_rss(struct mm_struct *mm) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index 0ed15fed579..d26acd3c1e2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -934,7 +934,7 @@ void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index a5de734e14a..2d27239ce4d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) -static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +static void __sync_task_rss_stat(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { - if (task->rss_stat.count[i]) { - add_mm_counter(mm, i, task->rss_stat.count[i]); - task->rss_stat.count[i] = 0; + if (current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; } } - task->rss_stat.events = 0; + current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) @@ -157,12 +157,12 @@ static void check_sync_rss_stat(struct task_struct *task) if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - __sync_task_rss_stat(task, task->mm); + __sync_task_rss_stat(task->mm); } -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +void sync_mm_rss(struct mm_struct *mm) { - __sync_task_rss_stat(task, mm); + __sync_task_rss_stat(mm); } #else /* SPLIT_RSS_COUNTING */ @@ -643,7 +643,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) int i; if (current->mm == mm) - sync_mm_rss(current, mm); + sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080..3dcfaf4ed35 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - sync_mm_rss(tsk, mm); + sync_mm_rss(mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); -- cgit v1.2.3-70-g09d2 From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision Userspace service managers/supervisors need to track their started services. Many services daemonize by double-forking and get implicitly re-parented to PID 1. The service manager will no longer be able to receive the SIGCHLD signals for them, and is no longer in charge of reaping the children with wait(). All information about the children is lost at the moment PID 1 cleans up the re-parented processes. With this prctl, a service manager process can mark itself as a sort of 'sub-init', able to stay as the parent for all orphaned processes created by the started services. All SIGCHLD signals will be delivered to the service manager. Receiving SIGCHLD and doing wait() is in cases of a service-manager much preferred over any possible asynchronous notification about specific PIDs, because the service manager has full access to the child process data in /proc and the PID can not be re-used until the wait(), the service-manager itself is in charge of, has happened. As a side effect, the relevant parent PID information does not get lost by a double-fork, which results in a more elaborate process tree and 'ps' output: before: # ps afx 253 ? Ss 0:00 /bin/dbus-daemon --system --nofork 294 ? Sl 0:00 /usr/libexec/polkit-1/polkitd 328 ? S 0:00 /usr/sbin/modem-manager 608 ? Sl 0:00 /usr/libexec/colord 658 ? Sl 0:00 /usr/libexec/upowerd 819 ? Sl 0:00 /usr/libexec/imsettings-daemon 916 ? Sl 0:00 /usr/libexec/udisks-daemon 917 ? S 0:00 \_ udisks-daemon: not polling any devices after: # ps afx 294 ? Ss 0:00 /bin/dbus-daemon --system --nofork 426 ? Sl 0:00 \_ /usr/libexec/polkit-1/polkitd 449 ? S 0:00 \_ /usr/sbin/modem-manager 635 ? Sl 0:00 \_ /usr/libexec/colord 705 ? Sl 0:00 \_ /usr/libexec/upowerd 959 ? Sl 0:00 \_ /usr/libexec/udisks-daemon 960 ? S 0:00 | \_ udisks-daemon: not polling any devices 977 ? Sl 0:00 \_ /usr/libexec/packagekitd This prctl is orthogonal to PID namespaces. PID namespaces are isolated from each other, while a service management process usually requires the services to live in the same namespace, to be able to talk to each other. Users of this will be the systemd per-user instance, which provides init-like functionality for the user's login session and D-Bus, which activates bus services on-demand. Both need init-like capabilities to be able to properly keep track of the services they start. Many thanks to Oleg for several rounds of review and insights. [akpm@linux-foundation.org: fix comment layout and spelling] [akpm@linux-foundation.org: add lengthy code comment from Oleg] Reviewed-by: Oleg Nesterov Signed-off-by: Lennart Poettering Signed-off-by: Kay Sievers Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 3 +++ include/linux/sched.h | 12 ++++++++++++ kernel/exit.c | 33 ++++++++++++++++++++++++++++----- kernel/fork.c | 3 +++ kernel/sys.c | 8 ++++++++ 5 files changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a0413ac3abe..e0cfec2490a 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -121,4 +121,7 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0c147a4260a..0c3854b0d4b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,6 +553,18 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + /* POSIX.1b Interval Timers */ struct list_head posix_timers; diff --git a/kernel/exit.c b/kernel/exit.c index 16b07bfac22..456329fd4ea 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). This is what we need to DTRT in a + * PID namespace. However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; diff --git a/kernel/fork.c b/kernel/fork.c index 37674ec55cd..b9372a0bff1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd19..9eb7fcab8df 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; -- cgit v1.2.3-70-g09d2 From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: kernel/exit.c: if init dies, log a signal which killed it, if any I just received another user's pleas for help when their init mysteriously died. I again explained that they need to check whether it died because of bad instruction, a segv, or something else. Which was an annoying detour into writing a trivial C program to spawn his init and print its exit code: http://lists.busybox.net/pipermail/busybox/2012-January/077172.html I hear you saying "just test it under /bin/sh". Well, the crashing init _was_ /bin/sh. Which prompted me to make kernel do this first step automatically. We can print exit code, which makes it possible to see that death was from e.g. SIGILL without writing test programs. [akpm@linux-foundation.org: add 0x to hex number output] Signed-off-by: Denys Vlasenko Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 456329fd4ea..3db1909faed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); -- cgit v1.2.3-70-g09d2