diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/exit.c | 40 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/kmod.c | 84 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 8 | ||||
-rw-r--r-- | kernel/ptrace.c | 66 | ||||
-rw-r--r-- | kernel/signal.c | 18 | ||||
-rw-r--r-- | kernel/sys.c | 8 | ||||
-rw-r--r-- | kernel/watchdog.c | 27 |
8 files changed, 166 insertions, 88 deletions
diff --git a/kernel/exit.c b/kernel/exit.c index 16b07bfac22..3db1909faed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); @@ -722,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). This is what we need to DTRT in a + * PID namespace. However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; diff --git a/kernel/fork.c b/kernel/fork.c index 37674ec55cd..b9372a0bff1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934..957a7aab8eb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; @@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) @@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -235,7 +278,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; /* CLONE_VFORK: wait until the usermode helper has execve'd @@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } @@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046..17b232869a0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). - */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed1..ee8d49b9c30 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) } static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. - */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request, /* * Protect exec's credential calculations against our interference; - * interference; SUID, SGID and LSM creds get determined differently + * SUID, SGID and LSM creds get determined differently * under ptrace. */ retval = -ERESTARTNOINTR; @@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { - child->ptrace &= ~PT_TRACE_MASK; + unsigned flags; - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) @@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5c..d523da02dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1059,7 +1058,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; @@ -1601,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd19..9eb7fcab8df 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14bc092fb12..df30ee08bdd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,6 +9,8 @@ * to those contributors as well. */ +#define pr_fmt(fmt) "NMI watchdog: " fmt + #include <linux/mm.h> #include <linux/cpu.h> #include <linux/nmi.h> @@ -319,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - /* initialize timestamp */ __touch_watchdog(); @@ -349,8 +349,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } + /* + * Drop the policy/priority elevation during thread exit to avoid a + * scheduling latency spike. + */ __set_current_state(TASK_RUNNING); - param.sched_priority = 0; sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + pr_info("enabled, takes one hw-pmu counter.\n"); goto out_save; } /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + pr_warning("disabled (cpu%i): hardware events not enabled\n", + cpu); else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -439,9 +444,10 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + pr_err("softlockup watchdog for %i failed\n", cpu); if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); @@ -450,6 +456,7 @@ static int watchdog_enable(int cpu) } goto out; } + sched_setscheduler(p, SCHED_FIFO, ¶m); kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; @@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void) watchdog_enabled = 1; if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + pr_err("failed to be enabled on some cpus\n"); } |