summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c29
-rw-r--r--kernel/exit.c91
-rw-r--r--kernel/fork.c33
-rw-r--r--kernel/ptrace.c197
-rw-r--r--kernel/rcutree_plugin.h53
-rw-r--r--kernel/sched.c233
-rw-r--r--kernel/sched_fair.c46
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/signal.c444
-rw-r--r--kernel/softirq.c12
-rw-r--r--kernel/workqueue.c81
11 files changed, 847 insertions, 374 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 93950031706..52501b5d490 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
#include <linux/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2b321bae44..73bb192a3d3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -169,7 +169,6 @@ void release_task(struct task_struct * p)
struct task_struct *leader;
int zap_leader;
repeat:
- tracehook_prepare_release_task(p);
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
@@ -179,7 +178,7 @@ repeat:
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- tracehook_finish_release_task(p);
+ ptrace_release_task(p);
__exit_signal(p);
/*
@@ -190,22 +189,12 @@ repeat:
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
- BUG_ON(task_detached(leader));
- do_notify_parent(leader, leader->exit_signal);
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
- *
- * do_notify_parent() will have marked it self-reaping in
- * that case.
- */
- zap_leader = task_detached(leader);
-
- /*
- * This maintains the invariant that release_task()
- * only runs on a task in EXIT_DEAD, just for sanity.
*/
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
@@ -277,18 +266,16 @@ int is_current_pgrp_orphaned(void)
return retval;
}
-static int has_stopped_jobs(struct pid *pgrp)
+static bool has_stopped_jobs(struct pid *pgrp)
{
- int retval = 0;
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (!task_is_stopped(p))
- continue;
- retval = 1;
- break;
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return retval;
+
+ return false;
}
/*
@@ -751,7 +738,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
{
list_move_tail(&p->sibling, &p->real_parent->children);
- if (task_detached(p))
+ if (p->exit_state == EXIT_DEAD)
return;
/*
* If this is a threaded reparent there is no need to
@@ -764,10 +751,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
- if (!task_ptrace(p) &&
+ if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
- do_notify_parent(p, p->exit_signal);
- if (task_detached(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
@@ -794,7 +780,7 @@ static void forget_original_parent(struct task_struct *father)
do {
t->real_parent = reaper;
if (t->parent == father) {
- BUG_ON(task_ptrace(t));
+ BUG_ON(t->ptrace);
t->parent = t->real_parent;
}
if (t->pdeath_signal)
@@ -819,8 +805,7 @@ static void forget_original_parent(struct task_struct *father)
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
- int signal;
- void *cookie;
+ bool autoreap;
/*
* This does two things:
@@ -851,26 +836,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* we have changed execution domain as these two values started
* the same after a fork.
*/
- if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
- signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal >= 0)
- signal = do_notify_parent(tsk, signal);
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
- tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- tracehook_report_death(tsk, signal, cookie, group_dead);
-
/* If the process is dead, release it - nobody will wait for it */
- if (signal == DEATH_REAP)
+ if (autoreap)
release_task(tsk);
}
@@ -923,7 +915,7 @@ NORET_TYPE void do_exit(long code)
*/
set_fs(USER_DS);
- tracehook_report_exit(&code);
+ ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
@@ -1235,9 +1227,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
traced = ptrace_reparented(p);
/*
* It can be ptraced but not reparented, check
- * !task_detached() to filter out sub-threads.
+ * thread_group_leader() to filter out sub-threads.
*/
- if (likely(!traced) && likely(!task_detached(p))) {
+ if (likely(!traced) && thread_group_leader(p)) {
struct signal_struct *psig;
struct signal_struct *sig;
unsigned long maxrss;
@@ -1345,16 +1337,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
/*
- * If this is not a detached task, notify the parent.
- * If it's still not detached after that, don't release
- * it now.
+ * If this is not a sub-thread, notify the parent.
+ * If parent wants a zombie, don't release it now.
*/
- if (!task_detached(p)) {
- do_notify_parent(p, p->exit_signal);
- if (!task_detached(p)) {
- p->exit_state = EXIT_ZOMBIE;
- p = NULL;
- }
+ if (thread_group_leader(p) &&
+ !do_notify_parent(p, p->exit_signal)) {
+ p->exit_state = EXIT_ZOMBIE;
+ p = NULL;
}
write_unlock_irq(&tasklist_lock);
}
@@ -1367,7 +1356,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p))
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1563,7 +1553,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Notification and reaping will be cascaded to the real
* parent when the ptracer detaches.
*/
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
/* it will become visible, clear notask_error */
wo->notask_error = 0;
return 0;
@@ -1606,8 +1596,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* own children, it should create a separate process which
* takes the role of real parent.
*/
- if (likely(!ptrace) && task_ptrace(p) &&
- same_thread_group(p->parent, p->real_parent))
+ if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
return 0;
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a..4d4117e0150 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
-#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
@@ -1340,7 +1339,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}
if (likely(p->pid)) {
- tracehook_finish_clone(p, clone_flags, trace);
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
if (is_child_reaper(pid))
@@ -1481,10 +1480,22 @@ long do_fork(unsigned long clone_flags,
}
/*
- * When called from kernel_thread, don't do user tracing stuff.
+ * Determine whether and which event to report to ptracer. When
+ * called from kernel_thread or CLONE_UNTRACED is explicitly
+ * requested, no event is reported; otherwise, report if the event
+ * for the type of forking is enabled.
*/
- if (likely(user_mode(regs)))
- trace = tracehook_prepare_clone(clone_flags);
+ if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+ if (clone_flags & CLONE_VFORK)
+ trace = PTRACE_EVENT_VFORK;
+ else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ trace = PTRACE_EVENT_CLONE;
+ else
+ trace = PTRACE_EVENT_FORK;
+
+ if (likely(!ptrace_event_enabled(current, trace)))
+ trace = 0;
+ }
p = copy_process(clone_flags, stack_start, regs, stack_size,
child_tidptr, NULL, trace);
@@ -1508,26 +1519,26 @@ long do_fork(unsigned long clone_flags,
}
audit_finish_fork(p);
- tracehook_report_clone(regs, clone_flags, nr, p);
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
- * hasn't gotten to tracehook_report_clone() yet. Now we
- * clear it and set the child going.
+ * hasn't finished SIGSTOP raising yet. Now we clear it
+ * and set the child going.
*/
p->flags &= ~PF_STARTING;
wake_up_new_task(p);
- tracehook_report_clone_complete(trace, regs,
- clone_flags, nr, p);
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event(trace, nr);
if (clone_flags & CLONE_VFORK) {
freezer_do_not_count();
wait_for_completion(&vfork);
freezer_count();
- tracehook_report_vfork_done(p, nr);
+ ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
}
} else {
nr = PTR_ERR(p);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd..9de3ecfd20f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
#include <linux/uaccess.h>
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
+static int ptrace_trapping_sleep_fn(void *flags)
+{
+ schedule();
+ return 0;
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child)
spin_lock(&child->sighand->siglock);
/*
- * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+ * Clear all pending traps and TRAPPING. TRAPPING should be
+ * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
+ */
+ task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+ task_clear_jobctl_trapping(child);
+
+ /*
+ * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
* @child isn't dead.
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
child->signal->group_stop_count))
- child->group_stop |= GROUP_STOP_PENDING;
+ child->jobctl |= JOBCTL_STOP_PENDING;
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child)
* is in TASK_TRACED; otherwise, we might unduly disrupt
* TASK_KILLABLE sleeps.
*/
- if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
signal_wake_up(child, task_is_traced(child));
spin_unlock(&child->sighand->siglock);
}
-/*
- * Check that we have indeed attached to the thing..
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations. If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing. If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if %child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, int kill)
+int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
@@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
spin_lock_irq(&child->sighand->siglock);
WARN_ON_ONCE(task_is_stopped(child));
- if (task_is_traced(child) || kill)
+ if (ignore_state || (task_is_traced(child) &&
+ !(child->jobctl & JOBCTL_LISTENING)))
ret = 0;
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !kill)
+ if (!ret && !ignore_state)
ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
/* All systems go.. */
@@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
-static int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long flags)
{
- bool wait_trap = false;
+ bool seize = (request == PTRACE_SEIZE);
int retval;
+ /*
+ * SEIZE will enable new ptrace behaviors which will be implemented
+ * gradually. SEIZE_DEVEL is used to prevent applications
+ * expecting full SEIZE behaviors trapping on kernel commits which
+ * are still in the process of implementing them.
+ *
+ * Only test programs for new ptrace behaviors being implemented
+ * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
+ *
+ * Once SEIZE behaviors are completely implemented, this flag and
+ * the following test will be removed.
+ */
+ retval = -EIO;
+ if (seize && !(flags & PTRACE_SEIZE_DEVEL))
+ goto out;
+
audit_ptrace(task);
retval = -EPERM;
@@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
+ if (seize)
+ task->ptrace |= PT_SEIZED;
if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
- send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
spin_lock(&task->sighand->siglock);
/*
- * If the task is already STOPPED, set GROUP_STOP_PENDING and
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
* TRAPPING, and kick it so that it transits to TRACED. TRAPPING
* will be cleared if the child completes the transition or any
* event which clears the group stop states happens. We'll wait
@@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task)
* The following task_is_stopped() test is safe as both transitions
* in and out of STOPPED are protected by siglock.
*/
- if (task_is_stopped(task)) {
- task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
signal_wake_up(task, 1);
- wait_trap = true;
- }
spin_unlock(&task->sighand->siglock);
@@ -257,9 +306,12 @@ unlock_tasklist:
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
- if (wait_trap)
- wait_event(current->signal->wait_chldexit,
- !(task->group_stop & GROUP_STOP_TRAPPING));
+ if (!retval) {
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
+ ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
+ }
+
return retval;
}
@@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
+ bool dead;
+
__ptrace_unlink(p);
- if (p->exit_state == EXIT_ZOMBIE) {
- if (!task_detached(p) && thread_group_empty(p)) {
- if (!same_thread_group(p->real_parent, tracer))
- do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
- __wake_up_parent(p, tracer);
- p->exit_signal = -1;
- }
- }
- if (task_detached(p)) {
- /* Mark it as in the process of being reaped. */
- p->exit_state = EXIT_DEAD;
- return true;
+ if (p->exit_state != EXIT_ZOMBIE)
+ return false;
+
+ dead = !thread_group_leader(p);
+
+ if (!dead && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, tracer))
+ dead = do_notify_parent(p, p->exit_signal);
+ else if (ignoring_children(tracer->sighand)) {
+ __wake_up_parent(p, tracer);
+ dead = true;
}
}
-
- return false;
+ /* Mark it as in the process of being reaped. */
+ if (dead)
+ p->exit_state = EXIT_DEAD;
+ return dead;
}
static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
}
write_unlock_irq(&tasklist_lock);
+ proc_ptrace_connector(child, PTRACE_DETACH);
if (unlikely(dead))
release_task(child);
@@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
+ bool seized = child->ptrace & PT_SEIZED;
int ret = -EIO;
- siginfo_t siginfo;
+ siginfo_t siginfo, *si;
void __user *datavp = (void __user *) data;
unsigned long __user *datalp = datavp;
+ unsigned long flags;
switch (request) {
case PTRACE_PEEKTEXT:
@@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
+ case PTRACE_INTERRUPT:
+ /*
+ * Stop tracee without any side-effect on signal or job
+ * control. At least one trap is guaranteed to happen
+ * after this request. If @child is already trapped, the
+ * current trap is not disturbed and another trap will
+ * happen after the current trap is ended with PTRACE_CONT.
+ *
+ * The actual trap might not be PTRACE_EVENT_STOP trap but
+ * the pending condition is cleared regardless.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ /*
+ * INTERRUPT doesn't disturb existing trap sans one
+ * exception. If ptracer issued LISTEN for the current
+ * STOP, this INTERRUPT should clear LISTEN and re-trap
+ * tracee into STOP.
+ */
+ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+ signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
+ case PTRACE_LISTEN:
+ /*
+ * Listen for events. Tracee must be in STOP. It's not
+ * resumed per-se but is not considered to be in TRACED by
+ * wait(2) or ptrace(2). If an async event (e.g. group
+ * stop state change) happens, tracee will enter STOP trap
+ * again. Alternatively, ptracer can issue INTERRUPT to
+ * finish listening and re-trap tracee into STOP.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ si = child->last_siginfo;
+ if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
+ break;
+
+ child->jobctl |= JOBCTL_LISTENING;
+
+ /*
+ * If NOTIFY is set, it means event happened between start
+ * of this trap and now. Trigger re-trap immediately.
+ */
+ if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+ signal_wake_up(child, true);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
case PTRACE_DETACH: /* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
@@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (ret < 0)
goto out_put_task_struct;
@@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out;
}
- if (request == PTRACE_ATTACH) {
- ret = ptrace_attach(child);
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
goto out_put_task_struct;
}
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
if (!ret)
ret = compat_arch_ptrace(child, request, addr, data);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 75113cb7c4f..8aafbb80b8b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
+static void rcu_read_unlock_special(struct task_struct *t);
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
/*
@@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu)
struct rcu_data *rdp;
struct rcu_node *rnp;
- if (t->rcu_read_lock_nesting &&
+ if (t->rcu_read_lock_nesting > 0 &&
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
@@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu)
rnp->gp_tasks = &t->rcu_node_entry;
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else if (t->rcu_read_lock_nesting < 0 &&
+ t->rcu_read_unlock_special) {
+
+ /*
+ * Complete exit from RCU read-side critical section on
+ * behalf of preempted instance of __rcu_read_unlock().
+ */
+ rcu_read_unlock_special(t);
}
/*
@@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
-static void rcu_read_unlock_special(struct task_struct *t)
+static noinline void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
@@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/* Hardware IRQ handlers cannot block. */
- if (in_irq()) {
+ if (in_irq() || in_serving_softirq()) {
local_irq_restore(flags);
return;
}
@@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
+ /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
+ if (t->rcu_boosted) {
+ special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 0;
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
t->rcu_blocked_node = NULL;
@@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
if (special & RCU_READ_UNLOCK_BOOSTED) {
- t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
rt_mutex_unlock(t->rcu_boost_mutex);
t->rcu_boost_mutex = NULL;
}
@@ -387,13 +400,22 @@ void __rcu_read_unlock(void)
struct task_struct *t = current;
barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
- --t->rcu_read_lock_nesting;
- barrier(); /* decrement before load of ->rcu_read_unlock_special */
- if (t->rcu_read_lock_nesting == 0 &&
- unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
+ if (t->rcu_read_lock_nesting != 1)
+ --t->rcu_read_lock_nesting;
+ else {
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu)
rcu_preempt_qs(cpu);
return;
}
- if (per_cpu(rcu_preempt_data, cpu).qs_pending)
+ if (t->rcu_read_lock_nesting > 0 &&
+ per_cpu(rcu_preempt_data, cpu).qs_pending)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
@@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock_irqsave(&rnp->lock, flags);
for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp))
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
break;
+ }
if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
wake_up(&sync_rcu_preempt_exp_wq);
break;
}
@@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock(&rnp->lock); /* irqs already disabled */
rnp->expmask &= ~mask;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp)
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
- t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ t->rcu_boosted = 1;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc716f6d8a..fde6ff90352 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2544,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
{
struct rq *rq = this_rq();
- struct task_struct *list = xchg(&rq->wake_list, NULL);
-
- if (!list)
- return;
raw_spin_lock(&rq->lock);
@@ -2563,9 +2559,45 @@ static void sched_ttwu_pending(void)
raw_spin_unlock(&rq->lock);
}
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
void scheduler_ipi(void)
{
- sched_ttwu_pending();
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_do_pending(list);
+ irq_exit();
}
static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -6557,7 +6589,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!group->cpu_power) {
+ if (!group->sgp->power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -6581,9 +6613,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_POWER_SCALE) {
+ if (group->sgp->power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
- group->cpu_power);
+ group->sgp->power);
}
group = group->next;
@@ -6774,11 +6806,39 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+ kfree(sg->sgp);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
static void free_sched_domain(struct rcu_head *rcu)
{
struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- if (atomic_dec_and_test(&sd->groups->ref))
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgp);
kfree(sd->groups);
+ }
kfree(sd);
}
@@ -6945,6 +7005,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
+ struct sched_group_power **__percpu sgp;
};
struct s_data {
@@ -6964,15 +7025,73 @@ struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+#define SDTL_OVERLAP 0x01
+
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
+ int flags;
struct sd_data data;
};
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *child;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(i));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+
+ child = *per_cpu_ptr(sdd->sd, i);
+ if (child->child) {
+ child = child->child;
+ cpumask_copy(sg_span, sched_domain_span(child));
+ } else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+ atomic_inc(&sg->sgp->ref);
+
+ if (cpumask_test_cpu(cpu, sg_span))
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6981,24 +7100,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (child)
cpu = cpumask_first(sched_domain_span(child));
- if (sg)
+ if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+ atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ }
return cpu;
}
/*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
*/
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
@@ -7006,6 +7125,12 @@ build_sched_groups(struct sched_domain *sd)
struct cpumask *covered;
int i;
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(sched_domain_span(sd)))
+ return 0;
+
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
@@ -7020,7 +7145,7 @@ build_sched_groups(struct sched_domain *sd)
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
+ sg->sgp->power = 0;
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -7037,6 +7162,8 @@ build_sched_groups(struct sched_domain *sd)
last = sg;
}
last->next = first;
+
+ return 0;
}
/*
@@ -7051,12 +7178,17 @@ build_sched_groups(struct sched_domain *sd)
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- WARN_ON(!sd || !sd->groups);
+ struct sched_group *sg = sd->groups;
- if (cpu != group_first_cpu(sd->groups))
- return;
+ WARN_ON(!sd || !sg);
- sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_first_cpu(sg))
+ return;
update_group_power(sd, cpu);
}
@@ -7177,15 +7309,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
static void claim_allocations(int cpu, struct sched_domain *sd)
{
struct sd_data *sdd = sd->private;
- struct sched_group *sg = sd->groups;
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
- if (cpu == cpumask_first(sched_group_cpus(sg))) {
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- }
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+ *per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
@@ -7210,7 +7342,7 @@ static struct sched_domain_topology_level default_topology[] = {
#endif
{ sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- { sd_init_NODE, cpu_node_mask, },
+ { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
{ NULL, },
@@ -7234,9 +7366,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
+ sdd->sgp = alloc_percpu(struct sched_group_power *);
+ if (!sdd->sgp)
+ return -ENOMEM;
+
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
+ struct sched_group_power *sgp;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -7251,6 +7388,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
return -ENOMEM;
*per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgp = kzalloc_node(sizeof(struct sched_group_power),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgp)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgp, j) = sgp;
}
}
@@ -7266,11 +7410,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
struct sd_data *sdd = &tl->data;
for_each_cpu(j, cpu_map) {
- kfree(*per_cpu_ptr(sdd->sd, j));
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sg, j));
+ kfree(*per_cpu_ptr(sdd->sgp, j));
}
free_percpu(sdd->sd);
free_percpu(sdd->sg);
+ free_percpu(sdd->sgp);
}
}
@@ -7316,8 +7464,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_topology_level *tl;
sd = NULL;
- for (tl = sched_domain_topology; tl->init; tl++)
+ for (tl = sched_domain_topology; tl->init; tl++) {
sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
while (sd->child)
sd = sd->child;
@@ -7329,13 +7482,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- get_group(i, sd->private, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (i != cpumask_first(sched_domain_span(sd)))
- continue;
-
- build_sched_groups(sd);
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
}
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8..c768588e180 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
if (local_group) {
this_load = avg_load;
@@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_POWER_SHIFT;
}
- sdg->cpu_power_orig = power;
+ sdg->sgp->power_orig = power;
if (sched_feat(ARCH_POWER))
power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power = 1;
cpu_rq(cpu)->cpu_power = power;
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power += group->cpu_power;
+ power += group->sgp->power;
group = group->next;
} while (group != child->groups);
- sdg->cpu_power = power;
+ sdg->sgp->power = power;
}
/*
@@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/*
* If ~90% of the cpu_power is still there, we're good.
*/
- if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ if (group->sgp->power * 32 > group->sgp->power_orig * 29)
return 1;
return 0;
@@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
/*
* Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += sg->cpu_power;
+ sds->total_pwr += sg->sgp->power;
/*
* In case the child domain prefers tasks go to siblings
@@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
if (this_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
SCHED_POWER_SCALE);
return 1;
}
@@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_POWER_SCALE;
- scaled_busy_load_per_task /= sds->busiest->cpu_power;
+ scaled_busy_load_per_task /= sds->busiest->sgp->power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them.
*/
- pwr_now += sds->busiest->cpu_power *
+ pwr_now += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->cpu_power *
+ pwr_now += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->busiest->cpu_power;
+ sds->busiest->sgp->power;
if (sds->max_load > tmp)
- pwr_move += sds->busiest->cpu_power *
+ pwr_move += sds->busiest->sgp->power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->cpu_power <
+ if (sds->max_load * sds->busiest->sgp->power <
sds->busiest_load_per_task * SCHED_POWER_SCALE)
- tmp = (sds->max_load * sds->busiest->cpu_power) /
- sds->this->cpu_power;
+ tmp = (sds->max_load * sds->busiest->sgp->power) /
+ sds->this->sgp->power;
else
tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
- sds->this->cpu_power;
- pwr_move += sds->this->cpu_power *
+ sds->this->sgp->power;
+ pwr_move += sds->this->sgp->power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_POWER_SCALE;
@@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= sds->busiest->cpu_power;
+ load_above_capacity /= sds->busiest->sgp->power;
}
/*
@@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->cpu_power,
- (sds->avg_load - sds->this_load) * sds->this->cpu_power)
+ *imbalance = min(max_pull * sds->busiest->sgp->power,
+ (sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee..1e7066d76c2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1)
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, 1)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/signal.c b/kernel/signal.c
index ff767860332..d7f70aed1cc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
/*
* Tracers may want to know about even ignored signals.
*/
- return !tracehook_consider_ignored_signal(t, sig);
+ return !t->ptrace;
}
/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
static int recalc_sigpending_tsk(struct task_struct *t)
{
- if ((t->group_stop & GROUP_STOP_PENDING) ||
+ if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
void recalc_sigpending(void)
{
- if (unlikely(tracehook_force_sigpending()))
- set_thread_flag(TIF_SIGPENDING);
- else if (!recalc_sigpending_tsk(current) && !freezing(current))
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
}
/**
- * task_clear_group_stop_trapping - clear group stop trapping bit
+ * task_set_jobctl_pending - set jobctl pending bits
* @task: target task
+ * @mask: pending bits to set
*
- * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
- * and wake up the ptracer. Note that we don't need any further locking.
- * @task->siglock guarantees that @task->parent points to the ptracer.
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
+ * cleared. If @task is already being killed or exiting, this function
+ * becomes noop.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if made noop because @task was dying.
+ */
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+ BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+ JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+ BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+ if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ return false;
+
+ if (mask & JOBCTL_STOP_SIGMASK)
+ task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+ task->jobctl |= mask;
+ return true;
+}
+
+/**
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
+ * @task: target task
+ *
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer. Note that we don't need any further
+ * locking. @task->siglock guarantees that @task->parent points to the
+ * ptracer.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-static void task_clear_group_stop_trapping(struct task_struct *task)
+void task_clear_jobctl_trapping(struct task_struct *task)
{
- if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
- task->group_stop &= ~GROUP_STOP_TRAPPING;
- __wake_up_sync_key(&task->parent->signal->wait_chldexit,
- TASK_UNINTERRUPTIBLE, 1, task);
+ if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_TRAPPING;
+ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}
}
/**
- * task_clear_group_stop_pending - clear pending group stop
+ * task_clear_jobctl_pending - clear jobctl pending bits
* @task: target task
+ * @mask: pending bits to clear
*
- * Clear group stop states for @task.
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
+ *
+ * If clearing of @mask leaves no stop or trap pending, this function calls
+ * task_clear_jobctl_trapping().
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-void task_clear_group_stop_pending(struct task_struct *task)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
{
- task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
- GROUP_STOP_DEQUEUED);
+ BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+ if (mask & JOBCTL_STOP_PENDING)
+ mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+ task->jobctl &= ~mask;
+
+ if (!(task->jobctl & JOBCTL_PENDING_MASK))
+ task_clear_jobctl_trapping(task);
}
/**
* task_participate_group_stop - participate in a group stop
* @task: task participating in a group stop
*
- * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
* Group stop states are cleared and the group stop count is consumed if
- * %GROUP_STOP_CONSUME was set. If the consumption completes the group
+ * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
* stop, the appropriate %SIGNAL_* flags are set.
*
* CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
static bool task_participate_group_stop(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
- bool consume = task->group_stop & GROUP_STOP_CONSUME;
+ bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
- WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+ WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
- task_clear_group_stop_pending(task);
+ task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
if (!consume)
return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
- return !tracehook_consider_fatal_signal(tsk, sig);
+ /* if ptraced, let the tracer determine */
+ return !tsk->ptrace;
}
/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- current->group_stop |= GROUP_STOP_DEQUEUED;
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
return security_task_kill(t, info, sig, 0);
}
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules sticky ptrace trap which is cleared on the next
+ * TRAP_STOP to notify ptracer of an event. @t must have been seized by
+ * ptracer.
+ *
+ * If @t is running, STOP trap will be taken. If trapped for STOP and
+ * ptracer is listening for events, tracee is woken up so that it can
+ * re-trap for the new event. If trapped otherwise, STOP trap will be
+ * eventually taken without returning to userland after the existing traps
+ * are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+ WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+ assert_spin_locked(&t->sighand->siglock);
+
+ task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+ signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+}
+
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
- wake_up_state(t, __TASK_STOPPED);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ wake_up_state(t, __TASK_STOPPED);
+ else
+ ptrace_trap_notify(t);
} while_each_thread(p, t);
/*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
if (sig_fatal(p, sig) &&
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL ||
- !tracehook_consider_fatal_signal(t, sig))) {
+ (sig == SIGKILL || !t->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
signal->group_stop_count = 0;
t = p;
do {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
- task_clear_group_stop_pending(t);
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
count++;
/* Don't bother with already dead threads */
@@ -1178,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
{
struct sighand_struct *sighand;
- rcu_read_lock();
for (;;) {
+ local_irq_save(*flags);
+ rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
- if (unlikely(sighand == NULL))
+ if (unlikely(sighand == NULL)) {
+ rcu_read_unlock();
+ local_irq_restore(*flags);
break;
+ }
- spin_lock_irqsave(&sighand->siglock, *flags);
- if (likely(sighand == tsk->sighand))
+ spin_lock(&sighand->siglock);
+ if (likely(sighand == tsk->sighand)) {
+ rcu_read_unlock();
break;
- spin_unlock_irqrestore(&sighand->siglock, *flags);
+ }
+ spin_unlock(&sighand->siglock);
+ rcu_read_unlock();
+ local_irq_restore(*flags);
}
- rcu_read_unlock();
return sighand;
}
@@ -1504,22 +1584,22 @@ ret:
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
*/
-int do_notify_parent(struct task_struct *tsk, int sig)
+bool do_notify_parent(struct task_struct *tsk, int sig)
{
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
- int ret = sig;
+ bool autoreap = false;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
- BUG_ON(!task_ptrace(tsk) &&
+ BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
info.si_signo = sig;
@@ -1558,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
- if (!task_ptrace(tsk) && sig == SIGCHLD &&
+ if (!tsk->ptrace && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
@@ -1576,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
- ret = tsk->exit_signal = -1;
+ autoreap = true;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
- sig = -1;
+ sig = 0;
}
- if (valid_signal(sig) && sig > 0)
+ if (valid_signal(sig) && sig)
__group_send_sig_info(sig, &info, tsk->parent);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
- return ret;
+ return autoreap;
}
/**
@@ -1658,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
static inline int may_ptrace_stop(void)
{
- if (!likely(task_ptrace(current)))
+ if (!likely(current->ptrace))
return 0;
/*
* Are we in the middle of do_coredump?
@@ -1687,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
}
/*
- * Test whether the target task of the usual cldstop notification - the
- * real_parent of @child - is in the same group as the ptracer.
- */
-static bool real_parent_is_ptracer(struct task_struct *child)
-{
- return same_thread_group(child->parent, child->real_parent);
-}
-
-/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
@@ -1732,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
}
/*
- * If @why is CLD_STOPPED, we're trapping to participate in a group
- * stop. Do the bookkeeping. Note that if SIGCONT was delievered
- * while siglock was released for the arch hook, PENDING could be
- * clear now. We act as if SIGCONT is received after TASK_TRACED
- * is entered - ignore it.
+ * We're committing to trapping. TRACED should be visible before
+ * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
+ * Also, transition to TRACED and updates to ->jobctl should be
+ * atomic with respect to siglock and should be done after the arch
+ * hook as siglock is released and regrabbed across it.
*/
- if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
- gstop_done = task_participate_group_stop(current);
+ set_current_state(TASK_TRACED);
current->last_siginfo = info;
current->exit_code = exit_code;
/*
- * TRACED should be visible before TRAPPING is cleared; otherwise,
- * the tracer might fail do_wait().
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delievered
+ * across siglock relocks since INTERRUPT was scheduled, PENDING
+ * could be clear now. We act as if SIGCONT is received after
+ * TASK_TRACED is entered - ignore it.
*/
- set_current_state(TASK_TRACED);
+ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
- /*
- * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
- * transition to TASK_TRACED should be atomic with respect to
- * siglock. This hsould be done after the arch hook as siglock is
- * released and regrabbed across it.
- */
- task_clear_group_stop_trapping(current);
+ /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
+
+ /* entering a trap, clear TRAPPING */
+ task_clear_jobctl_trapping(current);
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
@@ -1772,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
* separately unless they're gonna be duplicates.
*/
do_notify_parent_cldstop(current, true, why);
- if (gstop_done && !real_parent_is_ptracer(current))
+ if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
/*
@@ -1792,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
*
* If @gstop_done, the ptracer went away between group stop
* completion and here. During detach, it would have set
- * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
- * in do_signal_stop() on return, so notifying the real
- * parent of the group stop completion is enough.
+ * JOBCTL_STOP_PENDING on us and we'll re-enter
+ * TASK_STOPPED in do_signal_stop() on return, so notifying
+ * the real parent of the group stop completion is enough.
*/
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
@@ -1820,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
spin_lock_irq(&current->sighand->siglock);
current->last_siginfo = NULL;
+ /* LISTENING can be set only during STOP traps, clear it */
+ current->jobctl &= ~JOBCTL_LISTENING;
+
/*
* Queued signals ignored us while we were stopped for tracing.
* So check for any that we should take before resuming user mode.
@@ -1828,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
recalc_sigpending_tsk(current);
}
-void ptrace_notify(int exit_code)
+static void ptrace_do_notify(int signr, int exit_code, int why)
{
siginfo_t info;
- BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-
memset(&info, 0, sizeof info);
- info.si_signo = SIGTRAP;
+ info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
info.si_uid = current_uid();
/* Let the debugger run. */
+ ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+ BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
+ ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
spin_unlock_irq(&current->sighand->siglock);
}
-/*
- * This performs the stopping for SIGSTOP and other stop signals.
- * We have to stop all threads in the thread group.
- * Returns non-zero if we've actually stopped and released the siglock.
- * Returns zero if we didn't stop and still hold the siglock.
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it. If already set, participate in the existing
+ * group stop. If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself. Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched. The caller must ensure that INTERRUPT trap handling takes
+ * places afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
*/
-static int do_signal_stop(int signr)
+static bool do_signal_stop(int signr)
+ __releases(&current->sighand->siglock)
{
struct signal_struct *sig = current->signal;
- if (!(current->group_stop & GROUP_STOP_PENDING)) {
- unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
+ if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+ unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
- /* signr will be recorded in task->group_stop for retries */
- WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+ /* signr will be recorded in task->jobctl for retries */
+ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
- if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
+ if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
- return 0;
+ return false;
/*
* There is no group stop already in progress. We must
* initiate one now.
@@ -1888,28 +1987,32 @@ static int do_signal_stop(int signr)
if (!(sig->flags & SIGNAL_STOP_STOPPED))
sig->group_exit_code = signr;
else
- WARN_ON_ONCE(!task_ptrace(current));
+ WARN_ON_ONCE(!current->ptrace);
+
+ sig->group_stop_count = 0;
+
+ if (task_set_jobctl_pending(current, signr | gstop))
+ sig->group_stop_count++;
- current->group_stop &= ~GROUP_STOP_SIGMASK;
- current->group_stop |= signr | gstop;
- sig->group_stop_count = 1;
for (t = next_thread(current); t != current;
t = next_thread(t)) {
- t->group_stop &= ~GROUP_STOP_SIGMASK;
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
- t->group_stop |= signr | gstop;
+ if (!task_is_stopped(t) &&
+ task_set_jobctl_pending(t, signr | gstop)) {
sig->group_stop_count++;
- signal_wake_up(t, 0);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ signal_wake_up(t, 0);
+ else
+ ptrace_trap_notify(t);
}
}
}
-retry:
- if (likely(!task_ptrace(current))) {
+
+ if (likely(!current->ptrace)) {
int notify = 0;
/*
@@ -1940,43 +2043,65 @@ retry:
/* Now we don't run again until woken by SIGCONT or SIGKILL */
schedule();
-
- spin_lock_irq(&current->sighand->siglock);
+ return true;
} else {
- ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
- CLD_STOPPED, 0, NULL);
- current->exit_code = 0;
+ /*
+ * While ptraced, group stop is handled by STOP trap.
+ * Schedule it and let the caller deal with it.
+ */
+ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ return false;
}
+}
- /*
- * GROUP_STOP_PENDING could be set if another group stop has
- * started since being woken up or ptrace wants us to transit
- * between TASK_STOPPED and TRACED. Retry group stop.
- */
- if (current->group_stop & GROUP_STOP_PENDING) {
- WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
- goto retry;
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
+ * accompanying siginfo. If stopped, lower eight bits of exit_code contain
+ * the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+ struct signal_struct *signal = current->signal;
+ int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
+
+ if (current->ptrace & PT_SEIZED) {
+ if (!signal->group_stop_count &&
+ !(signal->flags & SIGNAL_STOP_STOPPED))
+ signr = SIGTRAP;
+ WARN_ON_ONCE(!signr);
+ ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+ CLD_STOPPED);
+ } else {
+ WARN_ON_ONCE(!signr);
+ ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
}
-
- /* PTRACE_ATTACH might have raced with task killing, clear trapping */
- task_clear_group_stop_trapping(current);
-
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_finish_jctl();
-
- return 1;
}
static int ptrace_signal(int signr, siginfo_t *info,
struct pt_regs *regs, void *cookie)
{
- if (!task_ptrace(current))
- return signr;
-
ptrace_signal_deliver(regs, cookie);
-
- /* Let the debugger run. */
+ /*
+ * We do not check sig_kernel_stop(signr) but set this marker
+ * unconditionally because we do not know whether debugger will
+ * change signr. This flag has no meaning unless we are going
+ * to stop after return from ptrace_stop(). In this case it will
+ * be checked in do_signal_stop(), we should only stop if it was
+ * not cleared by SIGCONT while we were sleeping. See also the
+ * comment in dequeue_signal().
+ */
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
@@ -2032,7 +2157,6 @@ relock:
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
- struct task_struct *leader;
int why;
if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2053,13 +2177,11 @@ relock:
* a duplicate.
*/
read_lock(&tasklist_lock);
-
do_notify_parent_cldstop(current, false, why);
- leader = current->group_leader;
- if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
- do_notify_parent_cldstop(leader, true, why);
-
+ if (ptrace_reparented(current->group_leader))
+ do_notify_parent_cldstop(current->group_leader,
+ true, why);
read_unlock(&tasklist_lock);
goto relock;
@@ -2067,37 +2189,31 @@ relock:
for (;;) {
struct k_sigaction *ka;
- /*
- * Tracing can induce an artificial signal and choose sigaction.
- * The return value in @signr determines the default action,
- * but @info->si_signo is the signal number we will report.
- */
- signr = tracehook_get_signal(current, regs, info, return_ka);
- if (unlikely(signr < 0))
+
+ if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+ do_signal_stop(0))
goto relock;
- if (unlikely(signr != 0))
- ka = return_ka;
- else {
- if (unlikely(current->group_stop &
- GROUP_STOP_PENDING) && do_signal_stop(0))
- goto relock;
- signr = dequeue_signal(current, &current->blocked,
- info);
+ if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ goto relock;
+ }
- if (!signr)
- break; /* will return 0 */
+ signr = dequeue_signal(current, &current->blocked, info);
- if (signr != SIGKILL) {
- signr = ptrace_signal(signr, info,
- regs, cookie);
- if (!signr)
- continue;
- }
+ if (!signr)
+ break; /* will return 0 */
- ka = &sighand->action[signr-1];
+ if (unlikely(current->ptrace) && signr != SIGKILL) {
+ signr = ptrace_signal(signr, info,
+ regs, cookie);
+ if (!signr)
+ continue;
}
+ ka = &sighand->action[signr-1];
+
/* Trace actually delivered signals. */
trace_signal_deliver(signr, info, ka);
@@ -2253,7 +2369,7 @@ void exit_signals(struct task_struct *tsk)
signotset(&unblocked);
retarget_shared_pending(tsk, &unblocked);
- if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+ if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
task_participate_group_stop(tsk))
group_stop = CLD_STOPPED;
out:
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 40cf63ddd4b..fca82c32042 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
{
if (!force_irqthreads)
__do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#else
static inline void invoke_softirq(void)
{
if (!force_irqthreads)
do_softirq();
- else
+ else {
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
wakeup_softirqd();
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ }
}
#endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d0..25fb1b0e53f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
* per-CPU workqueues:
*/
struct workqueue_struct {
- unsigned int flags; /* I: WQ_* flags */
+ unsigned int flags; /* W: WQ_* flags */
union {
struct cpu_workqueue_struct __percpu *pcpu;
struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
mayday_mask_t mayday_mask; /* cpus requesting rescue */
struct worker *rescuer; /* I: rescue worker */
+ int nr_drainers; /* W: drain in progress */
int saved_max_active; /* W: saved cwq max_active */
const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
/* if dying, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & WQ_DYING) &&
+ if (unlikely(wq->flags & WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
@@ -2381,6 +2382,54 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * drain_workqueue - drain a workqueue
+ * @wq: workqueue to drain
+ *
+ * Wait until the workqueue becomes empty. While draining is in progress,
+ * only chain queueing is allowed. IOW, only currently pending or running
+ * work items on @wq can queue further work items on it. @wq is flushed
+ * repeatedly until it becomes empty. The number of flushing is detemined
+ * by the depth of chaining and should be relatively short. Whine if it
+ * takes too long.
+ */
+void drain_workqueue(struct workqueue_struct *wq)
+{
+ unsigned int flush_cnt = 0;
+ unsigned int cpu;
+
+ /*
+ * __queue_work() needs to test whether there are drainers, is much
+ * hotter than drain_workqueue() and already looks at @wq->flags.
+ * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
+ */
+ spin_lock(&workqueue_lock);
+ if (!wq->nr_drainers++)
+ wq->flags |= WQ_DRAINING;
+ spin_unlock(&workqueue_lock);
+reflush:
+ flush_workqueue(wq);
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+ continue;
+
+ if (++flush_cnt == 10 ||
+ (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+ pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
+ wq->name, flush_cnt);
+ goto reflush;
+ }
+
+ spin_lock(&workqueue_lock);
+ if (!--wq->nr_drainers)
+ wq->flags &= ~WQ_DRAINING;
+ spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(drain_workqueue);
+
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
bool wait_executing)
{
@@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- unsigned int flush_cnt = 0;
unsigned int cpu;
- /*
- * Mark @wq dying and drain all pending works. Once WQ_DYING is
- * set, only chain queueing is allowed. IOW, only currently
- * pending or running work items on @wq can queue further work
- * items on it. @wq is flushed repeatedly until it becomes empty.
- * The number of flushing is detemined by the depth of chaining and
- * should be relatively short. Whine if it takes too long.
- */
- wq->flags |= WQ_DYING;
-reflush:
- flush_workqueue(wq);
-
- for_each_cwq_cpu(cpu, wq) {
- struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-
- if (!cwq->nr_active && list_empty(&cwq->delayed_works))
- continue;
-
- if (++flush_cnt == 10 ||
- (flush_cnt % 100 == 0 && flush_cnt <= 1000))
- printk(KERN_WARNING "workqueue %s: flush on "
- "destruction isn't complete after %u tries\n",
- wq->name, flush_cnt);
- goto reflush;
- }
+ /* drain it before proceeding with destruction */
+ drain_workqueue(wq);
/*
* wq list is used to freeze wq, remove from list after