diff options
Diffstat (limited to 'kernel')
44 files changed, 1769 insertions, 1009 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 93950031706..52501b5d490 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -55,6 +55,9 @@ #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> +#ifdef CONFIG_SECURITY +#include <linux/security.h> +#endif #include <linux/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } +#ifdef CONFIG_SECURITY +/** + * audit_log_secctx - Converts and logs SELinux context + * @ab: audit_buffer + * @secid: security number + * + * This is a helper function that calls security_secid_to_secctx to convert + * secid to secctx and then adds the (converted) SELinux context to the audit + * log by calling audit_log_format, thus also preventing leak of internal secid + * to userspace. If secid cannot be converted audit_panic is called. + */ +void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ + u32 len; + char *secctx; + + if (security_secid_to_secctx(secid, &secctx, &len)) { + audit_panic("Cannot convert secid to context"); + } else { + audit_log_format(ab, " obj=%s", secctx); + security_release_secctx(secctx, len); + } +} +EXPORT_SYMBOL(audit_log_secctx); +#endif + EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); diff --git a/kernel/events/core.c b/kernel/events/core.c index d863b3c057b..9efe7108cca 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7402,26 +7402,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_move(struct task_struct *task) +static void +perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) { task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task) { @@ -7433,7 +7419,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_move(task); + perf_cgroup_attach_task(cgrp, task); } struct cgroup_subsys perf_subsys = { @@ -7442,6 +7428,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .attach_task = perf_cgroup_attach_task, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/exit.c b/kernel/exit.c index 20a40647152..73bb192a3d3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -169,7 +169,6 @@ void release_task(struct task_struct * p) struct task_struct *leader; int zap_leader; repeat: - tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); @@ -179,7 +178,7 @@ repeat: proc_flush_task(p); write_lock_irq(&tasklist_lock); - tracehook_finish_release_task(p); + ptrace_release_task(p); __exit_signal(p); /* @@ -190,22 +189,12 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(task_detached(leader)); - do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. - * - * do_notify_parent() will have marked it self-reaping in - * that case. - */ - zap_leader = task_detached(leader); - - /* - * This maintains the invariant that release_task() - * only runs on a task in EXIT_DEAD, just for sanity. */ + zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } @@ -277,18 +266,16 @@ int is_current_pgrp_orphaned(void) return retval; } -static int has_stopped_jobs(struct pid *pgrp) +static bool has_stopped_jobs(struct pid *pgrp) { - int retval = 0; struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (!task_is_stopped(p)) - continue; - retval = 1; - break; + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return retval; + + return false; } /* @@ -561,29 +548,28 @@ void exit_files(struct task_struct *tsk) #ifdef CONFIG_MM_OWNER /* - * Task p is exiting and it owned mm, lets find a new owner for it + * A task is exiting. If it owned this mm, find a new owner for the mm. */ -static inline int -mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) -{ - /* - * If there are other users of the mm and the owner (us) is exiting - * we need to find a new owner to take on the responsibility. - */ - if (atomic_read(&mm->mm_users) <= 1) - return 0; - if (mm->owner != p) - return 0; - return 1; -} - void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; retry: - if (!mm_need_new_owner(mm, p)) + /* + * If the exiting or execing task is not the owner, it's + * someone else's problem. + */ + if (mm->owner != p) return; + /* + * The current owner is exiting/execing and there are no other + * candidates. Do not leave the mm pointing to a possibly + * freed task structure. + */ + if (atomic_read(&mm->mm_users) <= 1) { + mm->owner = NULL; + return; + } read_lock(&tasklist_lock); /* @@ -752,7 +738,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, { list_move_tail(&p->sibling, &p->real_parent->children); - if (task_detached(p)) + if (p->exit_state == EXIT_DEAD) return; /* * If this is a threaded reparent there is no need to @@ -765,10 +751,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ - if (!task_ptrace(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - do_notify_parent(p, p->exit_signal); - if (task_detached(p)) { + if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_move_tail(&p->sibling, dead); } @@ -795,7 +780,7 @@ static void forget_original_parent(struct task_struct *father) do { t->real_parent = reaper; if (t->parent == father) { - BUG_ON(task_ptrace(t)); + BUG_ON(t->ptrace); t->parent = t->real_parent; } if (t->pdeath_signal) @@ -820,8 +805,7 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int signal; - void *cookie; + bool autoreap; /* * This does two things: @@ -852,26 +836,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && + if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; - signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal >= 0) - signal = do_notify_parent(tsk, signal); + if (unlikely(tsk->ptrace)) { + int sig = thread_group_leader(tsk) && + thread_group_empty(tsk) && + !ptrace_reparented(tsk) ? + tsk->exit_signal : SIGCHLD; + autoreap = do_notify_parent(tsk, sig); + } else if (thread_group_leader(tsk)) { + autoreap = thread_group_empty(tsk) && + do_notify_parent(tsk, tsk->exit_signal); + } else { + autoreap = true; + } - tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - tracehook_report_death(tsk, signal, cookie, group_dead); - /* If the process is dead, release it - nobody will wait for it */ - if (signal == DEATH_REAP) + if (autoreap) release_task(tsk); } @@ -924,7 +915,7 @@ NORET_TYPE void do_exit(long code) */ set_fs(USER_DS); - tracehook_report_exit(&code); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -1236,9 +1227,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check - * !task_detached() to filter out sub-threads. + * thread_group_leader() to filter out sub-threads. */ - if (likely(!traced) && likely(!task_detached(p))) { + if (likely(!traced) && thread_group_leader(p)) { struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; @@ -1346,16 +1337,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* - * If this is not a detached task, notify the parent. - * If it's still not detached after that, don't release - * it now. + * If this is not a sub-thread, notify the parent. + * If parent wants a zombie, don't release it now. */ - if (!task_detached(p)) { - do_notify_parent(p, p->exit_signal); - if (!task_detached(p)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + if (thread_group_leader(p) && + !do_notify_parent(p, p->exit_signal)) { + p->exit_state = EXIT_ZOMBIE; + p = NULL; } write_unlock_irq(&tasklist_lock); } @@ -1368,7 +1356,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p)) + if (task_is_stopped_or_traced(p) && + !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) @@ -1564,7 +1553,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Notification and reaping will be cascaded to the real * parent when the ptracer detaches. */ - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* it will become visible, clear notask_error */ wo->notask_error = 0; return 0; @@ -1607,8 +1596,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * own children, it should create a separate process which * takes the role of real parent. */ - if (likely(!ptrace) && task_ptrace(p) && - same_thread_group(p->parent, p->real_parent)) + if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) return 0; /* diff --git a/kernel/fork.c b/kernel/fork.c index 0276c30401a..ca339c5c581 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,7 +37,6 @@ #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> -#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> @@ -1013,7 +1012,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&p->pi_waiters, &p->pi_lock); + plist_head_init(&p->pi_waiters); p->pi_blocked_on = NULL; #endif } @@ -1340,7 +1339,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - tracehook_finish_clone(p, clone_flags, trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) @@ -1481,10 +1480,22 @@ long do_fork(unsigned long clone_flags, } /* - * When called from kernel_thread, don't do user tracing stuff. + * Determine whether and which event to report to ptracer. When + * called from kernel_thread or CLONE_UNTRACED is explicitly + * requested, no event is reported; otherwise, report if the event + * for the type of forking is enabled. */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); + if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (clone_flags & CLONE_VFORK) + trace = PTRACE_EVENT_VFORK; + else if ((clone_flags & CSIGNAL) != SIGCHLD) + trace = PTRACE_EVENT_CLONE; + else + trace = PTRACE_EVENT_FORK; + + if (likely(!ptrace_event_enabled(current, trace))) + trace = 0; + } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); @@ -1508,26 +1519,26 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. + * hasn't finished SIGSTOP raising yet. Now we clear it + * and set the child going. */ p->flags &= ~PF_STARTING; wake_up_new_task(p); - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282ea..3fbc76cbb9a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2697,7 +2697,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index b8cadf70b1f..5bf924d80b5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" - depends on DEBUG_FS && CONSTRUCTORS + depends on DEBUG_FS + select CONSTRUCTORS default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 31a9db71190..3a2cab407b9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) } /** - * irq_gc_ack - Ack pending interrupt + * irq_gc_ack_set_bit - Ack pending interrupt via setting bit * @d: irq_data */ -void irq_gc_ack(struct irq_data *d) +void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); u32 mask = 1 << (d->irq - gc->irq_base); @@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d) } /** + * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit + * @d: irq_data + */ +void irq_gc_ack_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = ~(1 << (d->irq - gc->irq_base)); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt * @d: irq_data */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 90cb55f6d7e..470d08c82bb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) switch (res) { case IRQ_WAKE_THREAD: /* - * Set result to handled so the spurious check - * does not trigger. - */ - res = IRQ_HANDLED; - - /* * Catch drivers which return WAKE_THREAD but * did not set up a thread function */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 886e80347b3..4c60a50e66b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -257,13 +257,11 @@ int __init early_irq_init(void) count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); - irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - alloc_masks(desc + i, GFP_KERNEL, node); - desc_smp_init(desc + i, node); + alloc_masks(&desc[i], GFP_KERNEL, node); + raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + desc_set_defaults(i, &desc[i], node); } return arch_early_irq_init(); } @@ -346,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) if (!cnt) return -EINVAL; + if (irq >= 0) { + if (from > irq) + return -EINVAL; + from = irq; + } + mutex_lock(&sparse_irq_lock); start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f7ce0021e1c..0a7840aeb0f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; + if (!desc) + return -EINVAL; + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ @@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ -static void +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { + irqreturn_t ret; + local_bh_disable(); - action->thread_fn(action->irq, action->dev_id); + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); local_bh_enable(); + return ret; } /* @@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) * preemtible - many of them need to sleep and wait for slow busses to * complete. */ -static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_thread_fn(struct irq_desc *desc, + struct irqaction *action) { - action->thread_fn(action->irq, action->dev_id); + irqreturn_t ret; + + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); + return ret; } /* @@ -753,7 +763,8 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); int wake; if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, @@ -783,8 +794,12 @@ static int irq_thread(void *data) desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { + irqreturn_t action_ret; + raw_spin_unlock_irq(&desc->lock); - handler_fn(desc, action); + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); } wake = atomic_dec_and_test(&desc->threads_active); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dfbd550401b..aa57d5da18c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -167,6 +167,13 @@ out: jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } +static inline int bad_action_ret(irqreturn_t action_ret) +{ + if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + return 0; + return 1; +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *action; unsigned long flags; - if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + if (bad_action_ret(action_ret)) { printk(KERN_ERR "irq event %d: bogus return value %x\n", irq, action_ret); } else { @@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { - printk(KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", - (unsigned long)action->handler); - printk("\n"); + printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + if (action->thread_fn) + printk(KERN_CONT " threaded [<%p>] %pf", + action->thread_fn, action->thread_fn); + printk(KERN_CONT "\n"); action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (desc->istate & IRQS_POLL_INPROGRESS) return; - if (unlikely(action_ret != IRQ_HANDLED)) { + /* we get here again via the threaded handler */ + if (action_ret == IRQ_WAKE_THREAD) + return; + + if (bad_action_ret(action_ret)) { + report_bad_irq(irq, desc, action_ret); + return; + } + + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, @@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, else desc->irqs_unhandled++; desc->last_unhandled = jiffies; - if (unlikely(action_ret != IRQ_NONE)) - report_bad_irq(irq, desc, action_ret); } if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index fa27e750dbc..a8ce45097f3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -375,15 +375,19 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct jump_label_key *key, int enable) { - struct jump_entry *entry = key->entries; - - /* if there are no users, entry can be NULL */ - if (entry) - __jump_label_update(key, entry, __stop___jump_table, enable); + struct jump_entry *entry = key->entries, *stop = __stop___jump_table; #ifdef CONFIG_MODULES + struct module *mod = __module_address((jump_label_t)key); + __jump_label_mod_update(key, enable); + + if (mod) + stop = mod->jump_entries + mod->num_jump_entries; #endif + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, stop, enable); } #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index ad6a81c58b4..47613dfb7b2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (sub_info->init) { - retval = sub_info->init(sub_info); - if (retval) - goto fail; - } - retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) @@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data) new->cap_inheritable); spin_unlock(&umh_sysctl_lock); + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto fail; + } + } + commit_creds(new); retval = kernel_execve(sub_info->path, @@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * context in which call_usermodehelper_exec is called. */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81968a065b4..3956f5149e2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); + if (hlock_class(hlock)->key == &__lockdep_no_validate__) + continue; + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -3436,7 +3439,7 @@ int lock_is_held(struct lockdep_map *lock) int ret = 0; if (unlikely(current->lockdep_recursion)) - return ret; + return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 6824ca7d4d0..37f05d0f079 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, @@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), .notifiers = &network_lat_notifier, .name = "network_latency", .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, @@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), .notifiers = &network_throughput_notifier, .name = "network_throughput", .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 87f4d24b55b..7b856b3458d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -224,6 +224,10 @@ config PM_OPP implementations a ready to use framework to manage OPPs. For more information, read <file:Documentation/power/opp.txt> -config PM_RUNTIME_CLK +config PM_CLK def_bool y - depends on PM_RUNTIME && HAVE_CLK + depends on PM && HAVE_CLK + +config PM_GENERIC_DOMAINS + bool + depends on PM diff --git a/kernel/power/main.c b/kernel/power/main.c index 2981af4ce7c..6c601f87196 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain(unsigned long val) { - return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) - == NOTIFY_BAD) ? -EINVAL : 0; + int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + + return notifier_to_errno(ret); } /* If set, devices may be suspended and resumed asynchronously. */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ace55889f70..06efa54f93d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1211,7 +1211,11 @@ static void free_unnecessary_pages(void) to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save - alloc_highmem; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; } memory_bm_position_reset(©_bm); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1c41ba21541..b6b71ad2208 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) suspend_ops = ops; mutex_unlock(&pm_mutex); } +EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { @@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state) { return state == PM_SUSPEND_MEM; } +EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static int suspend_test(int level) { @@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: state to enter + * suspend_enter - enter the desired system sleep state. + * @state: State to enter + * @wakeup: Returns information that suspend should not be entered again. * - * This function should be called after devices have been suspended. + * This function should be called after devices have been suspended. */ -static int suspend_enter(suspend_state_t state) +static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; @@ -165,7 +168,8 @@ static int suspend_enter(suspend_state_t state) error = syscore_suspend(); if (!error) { - if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { + *wakeup = pm_wakeup_pending(); + if (!(suspend_test(TEST_CORE) || *wakeup)) { error = suspend_ops->enter(state); events_check_enabled = false; } @@ -199,6 +203,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + bool wakeup = false; if (!suspend_ops) return -ENOSYS; @@ -220,7 +225,10 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - error = suspend_enter(state); + do { + error = suspend_enter(state, &wakeup); + } while (!error && !wakeup + && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: suspend_test_start(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7d02d33be69..42ddbc6f0de 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_RESTORE); } - if (error) + if (error) { + free_basic_memory_bitmaps(); atomic_inc(&snapshot_device_available); + } data->frozen = 0; data->ready = 0; data->platform_support = 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd..9de3ecfd20f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -23,8 +23,15 @@ #include <linux/uaccess.h> #include <linux/regset.h> #include <linux/hw_breakpoint.h> +#include <linux/cn_proc.h> +static int ptrace_trapping_sleep_fn(void *flags) +{ + schedule(); + return 0; +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. @@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child) spin_lock(&child->sighand->siglock); /* - * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * Clear all pending traps and TRAPPING. TRAPPING should be + * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. + */ + task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); + task_clear_jobctl_trapping(child); + + /* + * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and * @child isn't dead. */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count)) - child->group_stop |= GROUP_STOP_PENDING; + child->jobctl |= JOBCTL_STOP_PENDING; /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick @@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child) * is in TASK_TRACED; otherwise, we might unduly disrupt * TASK_KILLABLE sleeps. */ - if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) signal_wake_up(child, task_is_traced(child)); spin_unlock(&child->sighand->siglock); } -/* - * Check that we have indeed attached to the thing.. +/** + * ptrace_check_attach - check whether ptracee is ready for ptrace operation + * @child: ptracee to check for + * @ignore_state: don't check whether @child is currently %TASK_TRACED + * + * Check whether @child is being ptraced by %current and ready for further + * ptrace operations. If @ignore_state is %false, @child also should be in + * %TASK_TRACED state and on return the child is guaranteed to be traced + * and not executing. If @ignore_state is %true, @child can be in any + * state. + * + * CONTEXT: + * Grabs and releases tasklist_lock and @child->sighand->siglock. + * + * RETURNS: + * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, int kill) +int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ spin_lock_irq(&child->sighand->siglock); WARN_ON_ONCE(task_is_stopped(child)); - if (task_is_traced(child) || kill) + if (ignore_state || (task_is_traced(child) && + !(child->jobctl & JOBCTL_LISTENING))) ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) + if (!ret && !ignore_state) ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ @@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -static int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task, long request, + unsigned long flags) { - bool wait_trap = false; + bool seize = (request == PTRACE_SEIZE); int retval; + /* + * SEIZE will enable new ptrace behaviors which will be implemented + * gradually. SEIZE_DEVEL is used to prevent applications + * expecting full SEIZE behaviors trapping on kernel commits which + * are still in the process of implementing them. + * + * Only test programs for new ptrace behaviors being implemented + * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. + * + * Once SEIZE behaviors are completely implemented, this flag and + * the following test will be removed. + */ + retval = -EIO; + if (seize && !(flags & PTRACE_SEIZE_DEVEL)) + goto out; + audit_ptrace(task); retval = -EPERM; @@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; + if (seize) + task->ptrace |= PT_SEIZED; if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); spin_lock(&task->sighand->siglock); /* - * If the task is already STOPPED, set GROUP_STOP_PENDING and + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and * TRAPPING, and kick it so that it transits to TRACED. TRAPPING * will be cleared if the child completes the transition or any * event which clears the group stop states happens. We'll wait @@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task) * The following task_is_stopped() test is safe as both transitions * in and out of STOPPED are protected by siglock. */ - if (task_is_stopped(task)) { - task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + if (task_is_stopped(task) && + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) signal_wake_up(task, 1); - wait_trap = true; - } spin_unlock(&task->sighand->siglock); @@ -257,9 +306,12 @@ unlock_tasklist: unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: - if (wait_trap) - wait_event(current->signal->wait_chldexit, - !(task->group_stop & GROUP_STOP_TRAPPING)); + if (!retval) { + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, + ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); + proc_ptrace_connector(task, PTRACE_ATTACH); + } + return retval; } @@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh) */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { + bool dead; + __ptrace_unlink(p); - if (p->exit_state == EXIT_ZOMBIE) { - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { - __wake_up_parent(p, tracer); - p->exit_signal = -1; - } - } - if (task_detached(p)) { - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return true; + if (p->exit_state != EXIT_ZOMBIE) + return false; + + dead = !thread_group_leader(p); + + if (!dead && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + dead = do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); + dead = true; } } - - return false; + /* Mark it as in the process of being reaped. */ + if (dead) + p->exit_state = EXIT_DEAD; + return dead; } static int ptrace_detach(struct task_struct *child, unsigned int data) @@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) } write_unlock_irq(&tasklist_lock); + proc_ptrace_connector(child, PTRACE_DETACH); if (unlikely(dead)) release_task(child); @@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; - siginfo_t siginfo; + siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; + unsigned long flags; switch (request) { case PTRACE_PEEKTEXT: @@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_INTERRUPT: + /* + * Stop tracee without any side-effect on signal or job + * control. At least one trap is guaranteed to happen + * after this request. If @child is already trapped, the + * current trap is not disturbed and another trap will + * happen after the current trap is ended with PTRACE_CONT. + * + * The actual trap might not be PTRACE_EVENT_STOP trap but + * the pending condition is cleared regardless. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + /* + * INTERRUPT doesn't disturb existing trap sans one + * exception. If ptracer issued LISTEN for the current + * STOP, this INTERRUPT should clear LISTEN and re-trap + * tracee into STOP. + */ + if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) + signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + + case PTRACE_LISTEN: + /* + * Listen for events. Tracee must be in STOP. It's not + * resumed per-se but is not considered to be in TRACED by + * wait(2) or ptrace(2). If an async event (e.g. group + * stop state change) happens, tracee will enter STOP trap + * again. Alternatively, ptracer can issue INTERRUPT to + * finish listening and re-trap tracee into STOP. + */ + if (unlikely(!seized || !lock_task_sighand(child, &flags))) + break; + + si = child->last_siginfo; + if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) + break; + + child->jobctl |= JOBCTL_LISTENING; + + /* + * If NOTIFY is set, it means event happened between start + * of this trap and now. Trigger re-trap immediately. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); + + unlock_task_sighand(child, &flags); + ret = 0; + break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (ret < 0) goto out_put_task_struct; @@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out; } - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); + if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { + ret = ptrace_attach(child, request, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, goto out_put_task_struct; } - ret = ptrace_check_attach(child, request == PTRACE_KILL); + ret = ptrace_check_attach(child, request == PTRACE_KILL || + request == PTRACE_INTERRUPT); if (!ret) ret = compat_arch_ptrace(child, request, addr, data); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89419ff92e9..ba06207b1dd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -84,10 +84,35 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; +/* + * The rcu_scheduler_active variable transitions from zero to one just + * before the first task is spawned. So when this variable is zero, RCU + * can assume that there is but one task, allowing RCU to (for example) + * optimized synchronize_sched() to a simple barrier(). When this variable + * is one, RCU must actually do all the hard work required to detect real + * grace periods. This variable is also used to suppress boot-time false + * positives from lockdep-RCU error checking. + */ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* + * The rcu_scheduler_fully_active variable transitions from zero to one + * during the early_initcall() processing, which is after the scheduler + * is capable of creating new tasks. So RCU processing (for example, + * creating tasks for RCU priority boosting) must be delayed until after + * rcu_scheduler_fully_active transitions from zero to one. We also + * currently delay invocation of any RCU callbacks until after this point. + * + * It might later prove better for people registering RCU callbacks during + * early boot to take responsibility for these callbacks, but one step at + * a time. + */ +static int rcu_scheduler_fully_active __read_mostly; + +#ifdef CONFIG_RCU_BOOST + +/* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. */ @@ -96,10 +121,12 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); -static char rcu_kthreads_spawnable; + +#endif /* #ifdef CONFIG_RCU_BOOST */ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); -static void invoke_rcu_cpu_kthread(void); +static void invoke_rcu_core(void); +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ @@ -1088,14 +1115,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; - struct task_struct *t; - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } + rcu_stop_cpu_kthread(cpu); /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1231,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); } /* @@ -1277,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); } #ifdef CONFIG_SMP @@ -1442,13 +1463,14 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rsp, rdp); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_callbacks(rsp, rdp); } /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(void) +static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1465,342 +1487,22 @@ static void rcu_process_callbacks(void) * the current CPU with interrupts disabled, the rcu_cpu_kthread_task * cannot disappear out from under us. */ -static void invoke_rcu_cpu_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { - local_irq_restore(flags); - return; - } - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); - local_irq_restore(flags); -} - -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) -{ - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ - int policy; - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} - -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); -} - -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) -{ - struct sched_param sp; - struct timer_list yield_timer; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); -} - -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) -{ - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * earlier RCU softirq. - */ -static int rcu_cpu_kthread(void *arg) -{ - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); - - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - rcu_wait(*workp != 0 || kthread_should_stop()); - local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } - *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); - work = *workp; - *workp = 0; - local_irq_restore(flags); - if (work) - rcu_process_callbacks(); - local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); - spincnt = 0; - } - } - *statusp = RCU_KTHREAD_STOPPED; - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_kthreads_spawnable || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); - } - } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; -} - -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ - cpumask_var_t cm; - int cpu; - unsigned long mask = rnp->qsmaskinit; - - if (rnp->node_kthread_task == NULL) + if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + if (likely(!rsp->boost)) { + rcu_do_batch(rsp, rdp); return; - cpumask_clear(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { - cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); - free_cpumask_var(cm); + invoke_rcu_callbacks_kthread(); } -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) +static void invoke_rcu_core(void) { - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_kthreads_spawnable || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); + raise_softirq(RCU_SOFTIRQ); } -static void rcu_wake_one_boost_kthread(struct rcu_node *rnp); - -/* - * Spawn all kthreads -- called as soon as the scheduler is running. - */ -static int __init rcu_spawn_kthreads(void) -{ - int cpu; - struct rcu_node *rnp; - struct task_struct *t; - - rcu_kthreads_spawnable = 1; - for_each_possible_cpu(cpu) { - per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) { - (void)rcu_spawn_one_cpu_kthread(cpu); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t) - wake_up_process(t); - } - } - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - if (rnp->node_kthread_task) - wake_up_process(rnp->node_kthread_task); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) { - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - t = rnp->node_kthread_task; - if (t) - wake_up_process(t); - rcu_wake_one_boost_kthread(rnp); - } - } - return 0; -} -early_initcall(rcu_spawn_kthreads); - static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -2207,44 +1909,6 @@ static void __cpuinit rcu_prepare_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } -static void __cpuinit rcu_prepare_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - - /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_kthreads_spawnable) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } -} - -/* - * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state, - * but the RCU threads are woken on demand, and if demand is low this - * could be a while triggering the hung task watchdog. - * - * In order to avoid this, poke all tasks once the CPU is fully - * up and running. - */ -static void __cpuinit rcu_online_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t) - wake_up_process(t); - - t = rnp->node_kthread_task; - if (t) - wake_up_process(t); - - rcu_wake_one_boost_kthread(rnp); -} - /* * Handle CPU online/offline notification events. */ @@ -2262,7 +1926,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_prepare_kthreads(cpu); break; case CPU_ONLINE: - rcu_online_kthreads(cpu); case CPU_DOWN_FAILED: rcu_node_kthread_setaffinity(rnp, -1); rcu_cpu_kthread_setrt(cpu, 1); @@ -2410,6 +2073,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7b9a08b4aae..01b2ccda26f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -369,6 +369,7 @@ struct rcu_state { /* period because */ /* force_quiescent_state() */ /* was running. */ + u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ @@ -426,6 +427,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); +static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); @@ -450,11 +452,19 @@ static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static void invoke_rcu_callbacks_kthread(void); +#ifdef CONFIG_RCU_BOOST +static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, cpumask_var_t cm); -static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp, int rnp_index); +static void invoke_rcu_node_kthread(struct rcu_node *rnp); +static void rcu_yield(void (*f)(unsigned long), unsigned long arg); +#endif /* #ifdef CONFIG_RCU_BOOST */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt); +static void __cpuinit rcu_prepare_kthreads(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c8bff3099a8..8aafbb80b8b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(struct rcu_node *rnp); /* @@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_data *rdp; struct rcu_node *rnp; - if (t->rcu_read_lock_nesting && + if (t->rcu_read_lock_nesting > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu) rnp->gp_tasks = &t->rcu_node_entry; } raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; @@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } @@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; + /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ + if (t->rcu_boosted) { + special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 0; + } #endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; @@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; rt_mutex_unlock(t->rcu_boost_mutex); t->rcu_boost_mutex = NULL; } @@ -387,13 +400,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu) rcu_preempt_qs(cpu); return; } - if (per_cpu(rcu_preempt_data, cpu).qs_pending) + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } @@ -602,6 +625,15 @@ static void rcu_preempt_process_callbacks(void) &__get_cpu_var(rcu_preempt_data)); } +#ifdef CONFIG_RCU_BOOST + +static void rcu_preempt_do_callbacks(void) +{ + rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * Queue a preemptible-RCU callback for invocation after a grace period. */ @@ -686,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; + } if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&sync_rcu_preempt_exp_wq); break; } @@ -698,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1165,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -1249,6 +1283,23 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } /* + * Wake up the per-CPU kthread to invoke RCU callbacks. + */ +static void invoke_rcu_callbacks_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { + local_irq_restore(flags); + return; + } + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + local_irq_restore(flags); +} + +/* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost * kthread. @@ -1288,6 +1339,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, @@ -1299,13 +1351,372 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; } -static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Stop the RCU's per-CPU kthread when its CPU goes offline,. + */ +static void rcu_stop_cpu_kthread(int cpu) +{ + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_kthread_do_work(void) { - if (rnp->boost_kthread_task) - wake_up_process(rnp->boost_kthread_task); + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_do_callbacks(); +} + +/* + * Wake up the specified per-rcu_node-structure kthread. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. + */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); + struct rcu_node *rnp = rdp->mynode; + + atomic_or(rdp->grpmask, &rnp->wakemask); + invoke_rcu_node_kthread(rnp); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +{ + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, f, arg); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + *statusp = RCU_KTHREAD_WAITING; + rcu_wait(*workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_kthread_do_work(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + spincnt = 0; + } + } + *statusp = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + * + * Please note that we cannot simply refuse to wake up the per-CPU + * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, + * which can result in softlockup complaints if the task ends up being + * idle for more than a couple of minutes. + * + * However, please note also that we cannot bind the per-CPU kthread to its + * CPU until that CPU is fully online. We also cannot wait until the + * CPU is fully online before we create its per-CPU kthread, as this would + * deadlock the system when CPU notifiers tried waiting for grace + * periods. So we bind the per-CPU kthread to its CPU only if the CPU + * is online. If its CPU is not yet fully online, then the code in + * rcu_cpu_kthread() will wait until it is fully online, and then do + * the binding. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_scheduler_fully_active || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + if (cpu_online(cpu)) + kthread_bind(t, cpu); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(atomic_read(&rnp->wakemask) != 0); + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = atomic_xchg(&rnp->wakemask, 0); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + /* NOTREACHED */ + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + unsigned long flags; + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_scheduler_fully_active || + rnp->qsmaskinit == 0) + return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_scheduler_fully_active = 1; + for_each_possible_cpu(cpu) { + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rnp = rcu_get_root(rcu_state); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_scheduler_fully_active) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1315,23 +1726,39 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) +static void invoke_rcu_callbacks_kthread(void) { + WARN_ON_ONCE(1); } static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) +#ifdef CONFIG_HOTPLUG_CPU + +static void rcu_stop_cpu_kthread(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ +} + +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ +} + +static int __init rcu_scheduler_really_started(void) { + rcu_scheduler_fully_active = 1; return 0; } +early_initcall(rcu_scheduler_really_started); -static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) +static void __cpuinit rcu_prepare_kthreads(int cpu) { } @@ -1509,7 +1936,7 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) @@ -1560,7 +1987,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); return c; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9678cc3650f..4e144876dc6 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +#ifdef CONFIG_RCU_BOOST + DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); @@ -58,6 +60,8 @@ static char convert_kthread_status(unsigned int kthread_status) return "SRWOY"[kthread_status]; } +#endif /* #ifdef CONFIG_RCU_BOOST */ + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -76,7 +80,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", + seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -84,13 +88,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_NEXT_READY_TAIL]], ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " kt=%d/%c/%d ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, - rdp->blimit); + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, " b=%ld", rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -147,18 +154,21 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, + seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != rdp->nxttail[RCU_NEXT_READY_TAIL]], ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, ",%d,\"%c\"", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - rdp->blimit); + rdp->cpu))); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, ",%ld", rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -169,7 +179,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); +#ifdef CONFIG_RCU_BOOST + seq_puts(m, "\"kt\",\"ktl\""); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); diff --git a/kernel/resource.c b/kernel/resource.c index 798e2fae2a0..3ff40178dce 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,6 +38,14 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); +/* constraints to be met while allocating resources */ +struct resource_constraint { + resource_size_t min, max, align; + resource_size_t (*alignf)(void *, const struct resource *, + resource_size_t, resource_size_t); + void *alignf_data; +}; + static DEFINE_RWLOCK(resource_lock); static void *r_next(struct seq_file *m, void *v, loff_t *pos) @@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2) } /* - * Find empty slot in the resource tree given range and alignment. + * Find empty slot in the resource tree with the given range and + * alignment constraints */ -static int find_resource(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), - void *alignf_data) +static int __find_resource(struct resource *root, struct resource *old, + struct resource *new, + resource_size_t size, + struct resource_constraint *constraint) { struct resource *this = root->child; struct resource tmp = *new, avail, alloc; @@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new, * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to tmp->end below would cause an underflow. */ - if (this && this->start == 0) { - tmp.start = this->end + 1; + if (this && this->start == root->start) { + tmp.start = (this == old) ? old->start : this->end + 1; this = this->sibling; } for(;;) { if (this) - tmp.end = this->start - 1; + tmp.end = (this == old) ? this->end : this->start - 1; else tmp.end = root->end; - resource_clip(&tmp, min, max); + resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ avail = *new; - avail.start = ALIGN(tmp.start, align); + avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; if (avail.start >= tmp.start) { - alloc.start = alignf(alignf_data, &avail, size, align); + alloc.start = constraint->alignf(constraint->alignf_data, &avail, + size, constraint->align); alloc.end = alloc.start + size - 1; if (resource_contains(&avail, &alloc)) { new->start = alloc.start; @@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new, } if (!this) break; - tmp.start = this->end + 1; + if (this != old) + tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; } +/* + * Find empty slot in the resource tree given range and alignment. + */ +static int find_resource(struct resource *root, struct resource *new, + resource_size_t size, + struct resource_constraint *constraint) +{ + return __find_resource(root, NULL, new, size, constraint); +} + /** - * allocate_resource - allocate empty slot in the resource tree given range & alignment + * reallocate_resource - allocate a slot in the resource tree given range & alignment. + * The resource will be relocated if the new size cannot be reallocated in the + * current location. + * + * @root: root resource descriptor + * @old: resource descriptor desired by caller + * @newsize: new size of the resource descriptor + * @constraint: the size and alignment constraints to be met. + */ +int reallocate_resource(struct resource *root, struct resource *old, + resource_size_t newsize, + struct resource_constraint *constraint) +{ + int err=0; + struct resource new = *old; + struct resource *conflict; + + write_lock(&resource_lock); + + if ((err = __find_resource(root, old, &new, newsize, constraint))) + goto out; + + if (resource_contains(&new, old)) { + old->start = new.start; + old->end = new.end; + goto out; + } + + if (old->child) { + err = -EBUSY; + goto out; + } + + if (resource_contains(old, &new)) { + old->start = new.start; + old->end = new.end; + } else { + __release_resource(old); + *old = new; + conflict = __request_resource(root, old); + BUG_ON(conflict); + } +out: + write_unlock(&resource_lock); + return err; +} + + +/** + * allocate_resource - allocate empty slot in the resource tree given range & alignment. + * The resource will be reallocated with a new size if it was already allocated * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size @@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new, void *alignf_data) { int err; + struct resource_constraint constraint; if (!alignf) alignf = simple_align_resource; + constraint.min = min; + constraint.max = max; + constraint.align = align; + constraint.alignf = alignf; + constraint.alignf_data = alignf_data; + + if ( new->parent ) { + /* resource is already allocated, try reallocating with + the new constraints */ + return reallocate_resource(root, new, size, &constraint); + } + write_lock(&resource_lock); - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); + err = find_resource(root, new, size, &constraint); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index ab449117aaf..255e1662acd 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init_raw(&lock->wait_list, &lock->wait_lock); + plist_head_init(&lock->wait_list); debug_rt_mutex_init(lock, name); } diff --git a/kernel/sched.c b/kernel/sched.c index cbb3a0eee58..c518b05fd06 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -292,8 +292,8 @@ static DEFINE_SPINLOCK(task_group_lock); * (The default weight is 1024 - so there's no practical * limitation from this.) */ -#define MIN_SHARES 2 -#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() - * holds that lock for each task it moves into the cgroup. Therefore - * by holding that lock, we pin the task to the current cgroup. + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. */ static inline struct task_group *task_group(struct task_struct *p) { @@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock)); + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif @@ -2447,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } rcu_read_unlock(); } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + #endif /* CONFIG_SMP */ schedstat_inc(rq, ttwu_count); @@ -2455,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (cpu != task_cpu(p)) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SCHEDSTATS */ } @@ -2532,13 +2544,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +static void sched_ttwu_do_pending(struct task_struct *list) { struct rq *rq = this_rq(); - struct task_struct *list = xchg(&rq->wake_list, NULL); - - if (!list) - return; raw_spin_lock(&rq->lock); @@ -2551,9 +2559,45 @@ static void sched_ttwu_pending(void) raw_spin_unlock(&rq->lock); } +#ifdef CONFIG_HOTPLUG_CPU + +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + sched_ttwu_do_pending(list); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + void scheduler_ipi(void) { - sched_ttwu_pending(); + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. + */ + irq_enter(); + sched_ttwu_do_pending(list); + irq_exit(); } static void ttwu_queue_remote(struct task_struct *p, int cpu) @@ -2600,6 +2644,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; } @@ -2674,8 +2719,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); + } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -6542,7 +6589,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->cpu_power) { + if (!group->sgp->power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -6566,9 +6613,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_POWER_SCALE) { + if (group->sgp->power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + group->sgp->power); } group = group->next; @@ -6759,11 +6806,39 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + static void free_sched_domain(struct rcu_head *rcu) { struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - if (atomic_dec_and_test(&sd->groups->ref)) + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. + */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); kfree(sd->groups); + } kfree(sd); } @@ -6930,6 +7005,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; }; struct s_data { @@ -6949,15 +7025,73 @@ struct sched_domain_topology_level; typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +#define SDTL_OVERLAP 0x01 + struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + int flags; struct sd_data data; }; -/* - * Assumes the sched_domain tree is fully constructed - */ +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -6966,24 +7100,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) + if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + } return cpu; } /* - * build_sched_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. + * + * Assumes the sched_domain tree is fully constructed */ -static void -build_sched_groups(struct sched_domain *sd) +static int +build_sched_groups(struct sched_domain *sd, int cpu) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; @@ -6991,6 +7125,12 @@ build_sched_groups(struct sched_domain *sd) struct cpumask *covered; int i; + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; @@ -7005,7 +7145,7 @@ build_sched_groups(struct sched_domain *sd) continue; cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; + sg->sgp->power = 0; for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -7022,6 +7162,8 @@ build_sched_groups(struct sched_domain *sd) last = sg; } last->next = first; + + return 0; } /* @@ -7036,12 +7178,17 @@ build_sched_groups(struct sched_domain *sd) */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - WARN_ON(!sd || !sd->groups); + struct sched_group *sg = sd->groups; - if (cpu != group_first_cpu(sd->groups)) - return; + WARN_ON(!sd || !sg); + + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + if (cpu != group_first_cpu(sg)) + return; update_group_power(sd, cpu); } @@ -7162,15 +7309,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, static void claim_allocations(int cpu, struct sched_domain *sd) { struct sd_data *sdd = sd->private; - struct sched_group *sg = sd->groups; WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (cpu == cpumask_first(sched_group_cpus(sg))) { - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - } + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) + *per_cpu_ptr(sdd->sgp, cpu) = NULL; } #ifdef CONFIG_SCHED_SMT @@ -7195,7 +7342,7 @@ static struct sched_domain_topology_level default_topology[] = { #endif { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, }, + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, @@ -7219,9 +7366,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; + struct sched_group_power *sgp; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -7236,6 +7388,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) return -ENOMEM; *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; } } @@ -7251,11 +7410,15 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(sdd->sd, j)); + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sg, j)); + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); free_percpu(sdd->sg); + free_percpu(sdd->sgp); } } @@ -7301,8 +7464,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) + for (tl = sched_domain_topology; tl->init; tl++) { sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } while (sd->child) sd = sd->child; @@ -7314,13 +7482,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - get_group(i, sd->private, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (i != cpumask_first(sched_domain_span(sd))) - continue; - - build_sched_groups(sd); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } } } @@ -7742,6 +7910,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) @@ -7766,7 +7937,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init(&rt_rq->pushable_tasks); #endif rt_rq->rt_time = 0; @@ -7971,7 +8142,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init(&init_task.pi_waiters); #endif /* @@ -8435,10 +8606,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); mutex_lock(&shares_mutex); if (tg->shares == shares) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 433491c2dc8..c768588e180 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1585,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; if (local_group) { this_load = avg_load; @@ -2631,7 +2631,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->cpu_power_orig = power; + sdg->sgp->power_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -2647,7 +2647,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->cpu_power = power; + sdg->sgp->power = power; } static void update_group_power(struct sched_domain *sd, int cpu) @@ -2665,11 +2665,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power += group->cpu_power; + power += group->sgp->power; group = group->next; } while (group != child->groups); - sdg->cpu_power = power; + sdg->sgp->power = power; } /* @@ -2691,7 +2691,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * If ~90% of the cpu_power is still there, we're good. */ - if (group->cpu_power * 32 > group->cpu_power_orig * 29) + if (group->sgp->power * 32 > group->sgp->power_orig * 29) return 1; return 0; @@ -2771,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; /* * Consider the group unbalanced when the imbalance is larger @@ -2788,7 +2788,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); @@ -2877,7 +2877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += sg->cpu_power; + sds->total_pwr += sg->sgp->power; /* * In case the child domain prefers tasks go to siblings @@ -2962,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, if (this_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -2993,7 +2993,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->cpu_power; + scaled_busy_load_per_task /= sds->busiest->sgp->power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -3007,28 +3007,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->cpu_power * + pwr_now += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->cpu_power * + pwr_now += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->cpu_power; + sds->busiest->sgp->power; if (sds->max_load > tmp) - pwr_move += sds->busiest->cpu_power * + pwr_move += sds->busiest->sgp->power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->cpu_power < + if (sds->max_load * sds->busiest->sgp->power < sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->cpu_power) / - sds->this->cpu_power; + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; else tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->cpu_power; - pwr_move += sds->this->cpu_power * + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_POWER_SCALE; @@ -3074,7 +3074,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->cpu_power; + load_above_capacity /= sds->busiest->sgp->power; } /* @@ -3090,8 +3090,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->cpu_power, - (sds->avg_load - sds->this_load) * sds->this->cpu_power) + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index be40f7371ee..1e7066d76c2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1) * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, 1) + +SCHED_FEAT(FORCE_SD_OVERLAP, 0) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 88725c939e0..10d018212ba 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1096,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !need_resched()) + if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1239,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return -1; + if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ diff --git a/kernel/signal.c b/kernel/signal.c index 86c32b884f8..d7f70aed1cc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig); + return !t->ptrace; } /* @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if ((t->group_stop & GROUP_STOP_PENDING) || + if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (unlikely(tracehook_force_sigpending())) - set_thread_flag(TIF_SIGPENDING); - else if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig) } /** - * task_clear_group_stop_trapping - clear group stop trapping bit + * task_set_jobctl_pending - set jobctl pending bits * @task: target task + * @mask: pending bits to set * - * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it - * and wake up the ptracer. Note that we don't need any further locking. - * @task->siglock guarantees that @task->parent points to the ptracer. + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | + * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is + * cleared. If @task is already being killed or exiting, this function + * becomes noop. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if @mask is set, %false if made noop because @task was dying. + */ +bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) +{ + BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | + JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); + BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); + + if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) + return false; + + if (mask & JOBCTL_STOP_SIGMASK) + task->jobctl &= ~JOBCTL_STOP_SIGMASK; + + task->jobctl |= mask; + return true; +} + +/** + * task_clear_jobctl_trapping - clear jobctl trapping bit + * @task: target task + * + * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. + * Clear it and wake up the ptracer. Note that we don't need any further + * locking. @task->siglock guarantees that @task->parent points to the + * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ -static void task_clear_group_stop_trapping(struct task_struct *task) +void task_clear_jobctl_trapping(struct task_struct *task) { - if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { - task->group_stop &= ~GROUP_STOP_TRAPPING; - __wake_up_sync_key(&task->parent->signal->wait_chldexit, - TASK_UNINTERRUPTIBLE, 1, task); + if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_TRAPPING; + wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** - * task_clear_group_stop_pending - clear pending group stop + * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task + * @mask: pending bits to clear * - * Clear group stop states for @task. + * Clear @mask from @task->jobctl. @mask must be subset of + * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other + * STOP bits are cleared together. + * + * If clearing of @mask leaves no stop or trap pending, this function calls + * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ -void task_clear_group_stop_pending(struct task_struct *task) +void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) { - task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | - GROUP_STOP_DEQUEUED); + BUG_ON(mask & ~JOBCTL_PENDING_MASK); + + if (mask & JOBCTL_STOP_PENDING) + mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; + + task->jobctl &= ~mask; + + if (!(task->jobctl & JOBCTL_PENDING_MASK)) + task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * - * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if - * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate %SIGNAL_* flags are set. * * CONTEXT: @@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task) static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; - bool consume = task->group_stop & GROUP_STOP_CONSUME; + bool consume = task->jobctl & JOBCTL_STOP_CONSUME; - WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); - task_clear_group_stop_pending(task); + task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; @@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig); + /* if ptraced, let the tracer determine */ + return !tsk->ptrace; } /* @@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - current->group_stop |= GROUP_STOP_DEQUEUED; + current->jobctl |= JOBCTL_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } +/** + * ptrace_trap_notify - schedule trap to notify ptracer + * @t: tracee wanting to notify tracer + * + * This function schedules sticky ptrace trap which is cleared on the next + * TRAP_STOP to notify ptracer of an event. @t must have been seized by + * ptracer. + * + * If @t is running, STOP trap will be taken. If trapped for STOP and + * ptracer is listening for events, tracee is woken up so that it can + * re-trap for the new event. If trapped otherwise, STOP trap will be + * eventually taken without returning to userland after the existing traps + * are finished by PTRACE_CONT. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void ptrace_trap_notify(struct task_struct *t) +{ + WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); + assert_spin_locked(&t->sighand->siglock); + + task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); + signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); +} + /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - wake_up_state(t, __TASK_STOPPED); + if (likely(!(t->ptrace & PT_SEIZED))) + wake_up_state(t, __TASK_STOPPED); + else + ptrace_trap_notify(t); } while_each_thread(p, t); /* @@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig))) { + (sig == SIGKILL || !t->ptrace)) { /* * This signal will be fatal to the whole group. */ @@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { - task_clear_group_stop_pending(t); + task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ @@ -1178,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; - rcu_read_lock(); for (;;) { + local_irq_save(*flags); + rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); + local_irq_restore(*flags); break; + } - spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) + spin_lock(&sighand->siglock); + if (likely(sighand == tsk->sighand)) { + rcu_read_unlock(); break; - spin_unlock_irqrestore(&sighand->siglock, *flags); + } + spin_unlock(&sighand->siglock); + rcu_read_unlock(); + local_irq_restore(*flags); } - rcu_read_unlock(); return sighand; } @@ -1504,22 +1584,22 @@ ret: * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * - * Returns -1 if our parent ignored us and so we've switched to - * self-reaping, or else @sig. + * Returns true if our parent ignored us and so we've switched to + * self-reaping. */ -int do_notify_parent(struct task_struct *tsk, int sig) +bool do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; struct sighand_struct *psig; - int ret = sig; + bool autoreap = false; BUG_ON(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ BUG_ON(task_is_stopped_or_traced(tsk)); - BUG_ON(!task_ptrace(tsk) && + BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); info.si_signo = sig; @@ -1558,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); - if (!task_ptrace(tsk) && sig == SIGCHLD && + if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* @@ -1576,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ - ret = tsk->exit_signal = -1; + autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = -1; + sig = 0; } - if (valid_signal(sig) && sig > 0) + if (valid_signal(sig) && sig) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); - return ret; + return autoreap; } /** @@ -1658,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, static inline int may_ptrace_stop(void) { - if (!likely(task_ptrace(current))) + if (!likely(current->ptrace)) return 0; /* * Are we in the middle of do_coredump? @@ -1687,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk) } /* - * Test whether the target task of the usual cldstop notification - the - * real_parent of @child - is in the same group as the ptracer. - */ -static bool real_parent_is_ptracer(struct task_struct *child) -{ - return same_thread_group(child->parent, child->real_parent); -} - -/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1732,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) } /* - * If @why is CLD_STOPPED, we're trapping to participate in a group - * stop. Do the bookkeeping. Note that if SIGCONT was delievered - * while siglock was released for the arch hook, PENDING could be - * clear now. We act as if SIGCONT is received after TASK_TRACED - * is entered - ignore it. + * We're committing to trapping. TRACED should be visible before + * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). + * Also, transition to TRACED and updates to ->jobctl should be + * atomic with respect to siglock and should be done after the arch + * hook as siglock is released and regrabbed across it. */ - if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) - gstop_done = task_participate_group_stop(current); + set_current_state(TASK_TRACED); current->last_siginfo = info; current->exit_code = exit_code; /* - * TRACED should be visible before TRAPPING is cleared; otherwise, - * the tracer might fail do_wait(). + * If @why is CLD_STOPPED, we're trapping to participate in a group + * stop. Do the bookkeeping. Note that if SIGCONT was delievered + * across siglock relocks since INTERRUPT was scheduled, PENDING + * could be clear now. We act as if SIGCONT is received after + * TASK_TRACED is entered - ignore it. */ - set_current_state(TASK_TRACED); + if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) + gstop_done = task_participate_group_stop(current); - /* - * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and - * transition to TASK_TRACED should be atomic with respect to - * siglock. This hsould be done after the arch hook as siglock is - * released and regrabbed across it. - */ - task_clear_group_stop_trapping(current); + /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ + task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); + if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) + task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); + + /* entering a trap, clear TRAPPING */ + task_clear_jobctl_trapping(current); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); @@ -1772,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * separately unless they're gonna be duplicates. */ do_notify_parent_cldstop(current, true, why); - if (gstop_done && !real_parent_is_ptracer(current)) + if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); /* @@ -1792,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * * If @gstop_done, the ptracer went away between group stop * completion and here. During detach, it would have set - * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED - * in do_signal_stop() on return, so notifying the real - * parent of the group stop completion is enough. + * JOBCTL_STOP_PENDING on us and we'll re-enter + * TASK_STOPPED in do_signal_stop() on return, so notifying + * the real parent of the group stop completion is enough. */ if (gstop_done) do_notify_parent_cldstop(current, false, why); @@ -1820,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) spin_lock_irq(¤t->sighand->siglock); current->last_siginfo = NULL; + /* LISTENING can be set only during STOP traps, clear it */ + current->jobctl &= ~JOBCTL_LISTENING; + /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. @@ -1828,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) recalc_sigpending_tsk(current); } -void ptrace_notify(int exit_code) +static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); - memset(&info, 0, sizeof info); - info.si_signo = SIGTRAP; + info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = current_uid(); /* Let the debugger run. */ + ptrace_stop(exit_code, why, 1, &info); +} + +void ptrace_notify(int exit_code) +{ + BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); + ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); spin_unlock_irq(¤t->sighand->siglock); } -/* - * This performs the stopping for SIGSTOP and other stop signals. - * We have to stop all threads in the thread group. - * Returns non-zero if we've actually stopped and released the siglock. - * Returns zero if we didn't stop and still hold the siglock. +/** + * do_signal_stop - handle group stop for SIGSTOP and other stop signals + * @signr: signr causing group stop if initiating + * + * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr + * and participate in it. If already set, participate in the existing + * group stop. If participated in a group stop (and thus slept), %true is + * returned with siglock released. + * + * If ptraced, this function doesn't handle stop itself. Instead, + * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock + * untouched. The caller must ensure that INTERRUPT trap handling takes + * places afterwards. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which is released + * on %true return. + * + * RETURNS: + * %false if group stop is already cancelled or ptrace trap is scheduled. + * %true if participated in group stop. */ -static int do_signal_stop(int signr) +static bool do_signal_stop(int signr) + __releases(¤t->sighand->siglock) { struct signal_struct *sig = current->signal; - if (!(current->group_stop & GROUP_STOP_PENDING)) { - unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; + if (!(current->jobctl & JOBCTL_STOP_PENDING)) { + unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; - /* signr will be recorded in task->group_stop for retries */ - WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + /* signr will be recorded in task->jobctl for retries */ + WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); - if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || + if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) - return 0; + return false; /* * There is no group stop already in progress. We must * initiate one now. @@ -1888,28 +1987,32 @@ static int do_signal_stop(int signr) if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; else - WARN_ON_ONCE(!task_ptrace(current)); + WARN_ON_ONCE(!current->ptrace); + + sig->group_stop_count = 0; + + if (task_set_jobctl_pending(current, signr | gstop)) + sig->group_stop_count++; - current->group_stop &= ~GROUP_STOP_SIGMASK; - current->group_stop |= signr | gstop; - sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) { - t->group_stop &= ~GROUP_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ - if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { - t->group_stop |= signr | gstop; + if (!task_is_stopped(t) && + task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; - signal_wake_up(t, 0); + if (likely(!(t->ptrace & PT_SEIZED))) + signal_wake_up(t, 0); + else + ptrace_trap_notify(t); } } } -retry: - if (likely(!task_ptrace(current))) { + + if (likely(!current->ptrace)) { int notify = 0; /* @@ -1940,43 +2043,65 @@ retry: /* Now we don't run again until woken by SIGCONT or SIGKILL */ schedule(); - - spin_lock_irq(¤t->sighand->siglock); + return true; } else { - ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, - CLD_STOPPED, 0, NULL); - current->exit_code = 0; + /* + * While ptraced, group stop is handled by STOP trap. + * Schedule it and let the caller deal with it. + */ + task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); + return false; } +} - /* - * GROUP_STOP_PENDING could be set if another group stop has - * started since being woken up or ptrace wants us to transit - * between TASK_STOPPED and TRACED. Retry group stop. - */ - if (current->group_stop & GROUP_STOP_PENDING) { - WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); - goto retry; +/** + * do_jobctl_trap - take care of ptrace jobctl traps + * + * When PT_SEIZED, it's used for both group stop and explicit + * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with + * accompanying siginfo. If stopped, lower eight bits of exit_code contain + * the stop signal; otherwise, %SIGTRAP. + * + * When !PT_SEIZED, it's used only for group stop trap with stop signal + * number as exit_code and no siginfo. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, which may be + * released and re-acquired before returning with intervening sleep. + */ +static void do_jobctl_trap(void) +{ + struct signal_struct *signal = current->signal; + int signr = current->jobctl & JOBCTL_STOP_SIGMASK; + + if (current->ptrace & PT_SEIZED) { + if (!signal->group_stop_count && + !(signal->flags & SIGNAL_STOP_STOPPED)) + signr = SIGTRAP; + WARN_ON_ONCE(!signr); + ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), + CLD_STOPPED); + } else { + WARN_ON_ONCE(!signr); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); + current->exit_code = 0; } - - /* PTRACE_ATTACH might have raced with task killing, clear trapping */ - task_clear_group_stop_trapping(current); - - spin_unlock_irq(¤t->sighand->siglock); - - tracehook_finish_jctl(); - - return 1; } static int ptrace_signal(int signr, siginfo_t *info, struct pt_regs *regs, void *cookie) { - if (!task_ptrace(current)) - return signr; - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ + /* + * We do not check sig_kernel_stop(signr) but set this marker + * unconditionally because we do not know whether debugger will + * change signr. This flag has no meaning unless we are going + * to stop after return from ptrace_stop(). In this case it will + * be checked in do_signal_stop(), we should only stop if it was + * not cleared by SIGCONT while we were sleeping. See also the + * comment in dequeue_signal(). + */ + current->jobctl |= JOBCTL_STOP_DEQUEUED; ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ @@ -2032,7 +2157,6 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - struct task_struct *leader; int why; if (signal->flags & SIGNAL_CLD_CONTINUED) @@ -2053,13 +2177,11 @@ relock: * a duplicate. */ read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, false, why); - leader = current->group_leader; - if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) - do_notify_parent_cldstop(leader, true, why); - + if (ptrace_reparented(current->group_leader)) + do_notify_parent_cldstop(current->group_leader, + true, why); read_unlock(&tasklist_lock); goto relock; @@ -2067,37 +2189,31 @@ relock: for (;;) { struct k_sigaction *ka; - /* - * Tracing can induce an artificial signal and choose sigaction. - * The return value in @signr determines the default action, - * but @info->si_signo is the signal number we will report. - */ - signr = tracehook_get_signal(current, regs, info, return_ka); - if (unlikely(signr < 0)) + + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && + do_signal_stop(0)) goto relock; - if (unlikely(signr != 0)) - ka = return_ka; - else { - if (unlikely(current->group_stop & - GROUP_STOP_PENDING) && do_signal_stop(0)) - goto relock; - signr = dequeue_signal(current, ¤t->blocked, - info); + if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + goto relock; + } - if (!signr) - break; /* will return 0 */ + signr = dequeue_signal(current, ¤t->blocked, info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, - regs, cookie); - if (!signr) - continue; - } + if (!signr) + break; /* will return 0 */ - ka = &sighand->action[signr-1]; + if (unlikely(current->ptrace) && signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; } + ka = &sighand->action[signr-1]; + /* Trace actually delivered signals. */ trace_signal_deliver(signr, info, ka); @@ -2253,7 +2369,7 @@ void exit_signals(struct task_struct *tsk) signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); - if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && + if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: @@ -2365,7 +2481,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals - * @set: stores pending signals + * @nset: stores pending signals * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ diff --git a/kernel/smp.c b/kernel/smp.c index 73a19519355..fb67dfa8394 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { .notifier_call = hotplug_cfd, }; -static int __cpuinit init_call_single_data(void) +void __init call_function_init(void) { void *cpu = (void *)(long)smp_processor_id(); int i; @@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void) hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); register_cpu_notifier(&hotplug_cfd_notifier); - - return 0; } -early_initcall(init_call_single_data); /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources diff --git a/kernel/softirq.c b/kernel/softirq.c index 13960170cad..fca82c32042 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER" + "TASKLET", "SCHED", "HRTIMER", "RCU" }; /* @@ -315,16 +315,24 @@ static inline void invoke_softirq(void) { if (!force_irqthreads) __do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #else static inline void invoke_softirq(void) { if (!force_irqthreads) do_softirq(); - else + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4fc92445a29..f175d98bd35 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9ffea360a77..fc0f2200541 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -285,16 +285,18 @@ ret: static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; - struct listener *s, *tmp; + struct listener *s, *tmp, *s2; unsigned int cpu; if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; + s = NULL; if (isadd == REGISTER) { for_each_cpu(cpu, mask) { - s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, - cpu_to_node(cpu)); + if (!s) + s = kmalloc_node(sizeof(struct listener), + GFP_KERNEL, cpu_to_node(cpu)); if (!s) goto cleanup; s->pid = pid; @@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); + list_for_each_entry_safe(s2, tmp, &listeners->list, list) { + if (s2->pid == pid) + goto next_cpu; + } list_add(&s->list, &listeners->list); + s = NULL; +next_cpu: up_write(&listeners->sem); } + kfree(s); return 0; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2d966244ea6..59f369f98a0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -42,15 +42,75 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; -#endif +static DEFINE_SPINLOCK(rtcdev_lock); -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ -static ktime_t freezer_delta; -static DEFINE_SPINLOCK(freezer_delta_lock); +/** + * has_wakealarm - check rtc device has wakealarm ability + * @dev: current device + * @name_ptr: name to be returned + * + * This helper function checks to see if the rtc device can wake + * from suspend. + */ +static int has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(const char **)name_ptr = dev_name(dev); + return 1; +} + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. + */ +static struct rtc_device *alarmtimer_get_rtcdev(void) +{ + struct device *dev; + char *str; + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + /* Find an rtc device and init the rtc_timer */ + dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); + /* If we have a device then str is valid. See has_wakealarm() */ + if (dev) { + rtcdev = rtc_class_open(str); + /* + * Drop the reference we got in class_find_device, + * rtc_open takes its own. + */ + put_device(dev); + rtc_timer_init(&rtctimer, NULL, NULL); + } + } + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} +#else +#define alarmtimer_get_rtcdev() (0) +#define rtcdev (0) +#endif /** @@ -166,6 +226,7 @@ static int alarmtimer_suspend(struct device *dev) struct rtc_time tm; ktime_t min, now; unsigned long flags; + struct rtc_device *rtc; int i; spin_lock_irqsave(&freezer_delta_lock, flags); @@ -173,8 +234,9 @@ static int alarmtimer_suspend(struct device *dev) freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); + rtc = rtcdev; /* If we have no rtcdev, just return */ - if (!rtcdev) + if (!rtc) return 0; /* Find the soonest timer to expire*/ @@ -199,12 +261,12 @@ static int alarmtimer_suspend(struct device *dev) WARN_ON(min.tv64 < NSEC_PER_SEC); /* Setup an rtc timer to fire that far in the future */ - rtc_timer_cancel(rtcdev, &rtctimer); - rtc_read_time(rtcdev, &tm); + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0)); + rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); return 0; } @@ -322,6 +384,9 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + return hrtimer_get_res(baseid, tp); } @@ -336,6 +401,9 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + *tp = ktime_to_timespec(base->gettime()); return 0; } @@ -351,6 +419,9 @@ static int alarm_timer_create(struct k_itimer *new_timer) enum alarmtimer_type type; struct alarm_base *base; + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -385,6 +456,9 @@ static void alarm_timer_get(struct k_itimer *timr, */ static int alarm_timer_del(struct k_itimer *timr) { + if (!rtcdev) + return -ENOTSUPP; + alarm_cancel(&timr->it.alarmtimer); return 0; } @@ -402,6 +476,9 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { + if (!rtcdev) + return -ENOTSUPP; + /* Save old values */ old_setting->it_interval = ktime_to_timespec(timr->it.alarmtimer.period); @@ -541,6 +618,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, int ret = 0; struct restart_block *restart; + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + if (!capable(CAP_WAKE_ALARM)) return -EPERM; @@ -638,65 +718,3 @@ static int __init alarmtimer_init(void) } device_initcall(alarmtimer_init); -#ifdef CONFIG_RTC_CLASS -/** - * has_wakealarm - check rtc device has wakealarm ability - * @dev: current device - * @name_ptr: name to be returned - * - * This helper function checks to see if the rtc device can wake - * from suspend. - */ -static int __init has_wakealarm(struct device *dev, void *name_ptr) -{ - struct rtc_device *candidate = to_rtc_device(dev); - - if (!candidate->ops->set_alarm) - return 0; - if (!device_may_wakeup(candidate->dev.parent)) - return 0; - - *(const char **)name_ptr = dev_name(dev); - return 1; -} - -/** - * alarmtimer_init_late - Late initializing of alarmtimer code - * - * This function locates a rtc device to use for wakealarms. - * Run as late_initcall to make sure rtc devices have been - * registered. - */ -static int __init alarmtimer_init_late(void) -{ - struct device *dev; - char *str; - - /* Find an rtc device and init the rtc_timer */ - dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); - /* If we have a device then str is valid. See has_wakealarm() */ - if (dev) { - rtcdev = rtc_class_open(str); - /* - * Drop the reference we got in class_find_device, - * rtc_open takes its own. - */ - put_device(dev); - } - if (!rtcdev) { - printk(KERN_WARNING "No RTC device found, ALARM timers will" - " not wake from suspend"); - } - rtc_timer_init(&rtctimer, NULL, NULL); - - return 0; -} -#else -static int __init alarmtimer_init_late(void) -{ - printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers" - " will not wake from suspend"); - return 0; -} -#endif -late_initcall(alarmtimer_init_late); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c027d4f602f..e4c699dfa4e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - BUG_ON(!dev->cpumask); + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } raw_spin_lock_irqsave(&clockevents_lock, flags); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1c95fd67732..e0980f0d9a0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -185,7 +185,6 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); -static cycle_t watchdog_last; static int watchdog_running; static int clocksource_watchdog_kthread(void *data); @@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data) if (!watchdog_running) goto out; - wdnow = watchdog->read(watchdog); - wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, - watchdog->mult, watchdog->shift); - watchdog_last = wdnow; - list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data) continue; } + local_irq_disable(); csnow = cs->read(cs); + wdnow = watchdog->read(watchdog); + local_irq_enable(); /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; + cs->wd_last = wdnow; + cs->cs_last = csnow; continue; } - /* Check the deviation from the watchdog clocksource. */ - cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, + watchdog->mult, watchdog->shift); + + cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & cs->mask, cs->mult, cs->shift); - cs->wd_last = csnow; + cs->cs_last = csnow; + cs->wd_last = wdnow; + + /* Check the deviation from the watchdog clocksource. */ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; @@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void) return; init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; - watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; diff --git a/kernel/timer.c b/kernel/timer.c index fd6198692b5..8cff36119e4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) unsigned long expires_limit, mask; int bit; - expires_limit = expires; - if (timer->slack >= 0) { expires_limit = expires + timer->slack; } else { - unsigned long now = jiffies; + long delta = expires - jiffies; + + if (delta < 256) + return expires; - /* No slack, if already expired else auto slack 0.4% */ - if (time_after(expires, now)) - expires_limit = expires + (expires - now)/256; + expires_limit = expires + delta / 256; } mask = expires ^ expires_limit; if (mask == 0) @@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { + expires = apply_slack(timer, expires); + /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - expires = apply_slack(timer, expires); - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417fcbfa..908038f5744 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2740,7 +2740,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret; + int ret = -EINVAL; func = strsep(&next, ":"); @@ -3330,6 +3330,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; + unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -3346,7 +3347,13 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } + /* + * Disable interrupts to prevent interrupts from executing + * code that is being modified. + */ + local_irq_save(flags); ftrace_update_code(mod); + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45f0af..27d13b36b8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1870,8 +1870,12 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -static int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. + */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index dff763b7baf..1f06468a10d 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -240,13 +240,10 @@ static const char **find_next(void *v, loff_t *pos) const char **fmt = v; int start_index; - if (!fmt) - fmt = __start___trace_bprintk_fmt + *pos; - start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; if (*pos < start_index) - return fmt; + return __start___trace_bprintk_fmt + *pos; return find_next_mod_format(start_index, v, fmt, pos); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0400553f0d0..25fb1b0e53f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t; * per-CPU workqueues: */ struct workqueue_struct { - unsigned int flags; /* I: WQ_* flags */ + unsigned int flags; /* W: WQ_* flags */ union { struct cpu_workqueue_struct __percpu *pcpu; struct cpu_workqueue_struct *single; @@ -240,6 +240,7 @@ struct workqueue_struct { mayday_mask_t mayday_mask; /* cpus requesting rescue */ struct worker *rescuer; /* I: rescue worker */ + int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP @@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DYING) && + if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -2381,6 +2382,54 @@ out_unlock: } EXPORT_SYMBOL_GPL(flush_workqueue); +/** + * drain_workqueue - drain a workqueue + * @wq: workqueue to drain + * + * Wait until the workqueue becomes empty. While draining is in progress, + * only chain queueing is allowed. IOW, only currently pending or running + * work items on @wq can queue further work items on it. @wq is flushed + * repeatedly until it becomes empty. The number of flushing is detemined + * by the depth of chaining and should be relatively short. Whine if it + * takes too long. + */ +void drain_workqueue(struct workqueue_struct *wq) +{ + unsigned int flush_cnt = 0; + unsigned int cpu; + + /* + * __queue_work() needs to test whether there are drainers, is much + * hotter than drain_workqueue() and already looks at @wq->flags. + * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + */ + spin_lock(&workqueue_lock); + if (!wq->nr_drainers++) + wq->flags |= WQ_DRAINING; + spin_unlock(&workqueue_lock); +reflush: + flush_workqueue(wq); + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + continue; + + if (++flush_cnt == 10 || + (flush_cnt % 100 == 0 && flush_cnt <= 1000)) + pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); + goto reflush; + } + + spin_lock(&workqueue_lock); + if (!--wq->nr_drainers) + wq->flags &= ~WQ_DRAINING; + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(drain_workqueue); + static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, bool wait_executing) { @@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int flush_cnt = 0; unsigned int cpu; - /* - * Mark @wq dying and drain all pending works. Once WQ_DYING is - * set, only chain queueing is allowed. IOW, only currently - * pending or running work items on @wq can queue further work - * items on it. @wq is flushed repeatedly until it becomes empty. - * The number of flushing is detemined by the depth of chaining and - * should be relatively short. Whine if it takes too long. - */ - wq->flags |= WQ_DYING; -reflush: - flush_workqueue(wq); - - for_each_cwq_cpu(cpu, wq) { - struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) - continue; - - if (++flush_cnt == 10 || - (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - printk(KERN_WARNING "workqueue %s: flush on " - "destruction isn't complete after %u tries\n", - wq->name, flush_cnt); - goto reflush; - } + /* drain it before proceeding with destruction */ + drain_workqueue(wq); /* * wq list is used to freeze wq, remove from list after |