Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c               |  27
-rw-r--r--  kernel/compat.c              |  23
-rw-r--r--  kernel/context_tracking.c    |  75
-rw-r--r--  kernel/debug/kdb/kdb_main.c  |   2
-rw-r--r--  kernel/events/core.c         |  20
-rw-r--r--  kernel/fork.c                |   6
-rw-r--r--  kernel/module.c              | 154
-rw-r--r--  kernel/pid.c                 |   2
-rw-r--r--  kernel/printk.c              |   9
-rw-r--r--  kernel/ptrace.c              |  74
-rw-r--r--  kernel/rcu.h                 |   7
-rw-r--r--  kernel/rcupdate.c            |  60
-rw-r--r--  kernel/rcutiny.c             |   8
-rw-r--r--  kernel/rcutiny_plugin.h      |  56
-rw-r--r--  kernel/rcutorture.c          |  66
-rw-r--r--  kernel/rcutree.c             | 260
-rw-r--r--  kernel/rcutree.h             |  11
-rw-r--r--  kernel/rcutree_plugin.h      |  13
-rw-r--r--  kernel/sched/core.c          |   3
-rw-r--r--  kernel/sched/debug.c         |   4
-rw-r--r--  kernel/sched/fair.c          |   2
-rw-r--r--  kernel/sched/rt.c            |   2
-rw-r--r--  kernel/signal.c              |  24
-rw-r--r--  kernel/smp.c                 |  13
-rw-r--r--  kernel/srcu.c                |  37
-rw-r--r--  kernel/trace/ftrace.c        |   2
-rw-r--r--  kernel/trace/trace_clock.c   |   1
27 files changed, 714 insertions(+), 247 deletions(-)
diff --git a/kernel/async.c b/kernel/async.c index a1d585c351d..6f34904a0b5 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -86,18 +86,27 @@ static atomic_t entry_count; */ static async_cookie_t __lowest_in_progress(struct async_domain *running) { + async_cookie_t first_running = next_cookie; /* infinity value */ + async_cookie_t first_pending = next_cookie; /* ditto */ struct async_entry *entry; + /* + * Both running and pending lists are sorted but not disjoint. + * Take the first cookies from both and return the min. + */ if (!list_empty(&running->domain)) { entry = list_first_entry(&running->domain, typeof(*entry), list); - return entry->cookie; + first_running = entry->cookie; } - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) - return entry->cookie; + list_for_each_entry(entry, &async_pending, list) { + if (entry->running == running) { + first_pending = entry->cookie; + break; + } + } - return next_cookie; /* "infinity" value */ + return min(first_running, first_pending); } static async_cookie_t lowest_in_progress(struct async_domain *running) @@ -118,13 +127,17 @@ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); + struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; struct async_domain *running = entry->running; - /* 1) move self to the running queue */ + /* 1) move self to the running queue, make sure it stays sorted */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, &running->domain); + list_for_each_entry_reverse(pos, &running->domain, list) + if (entry->cookie < pos->cookie) + break; + list_move_tail(&entry->list, &pos->list); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ diff --git a/kernel/compat.c b/kernel/compat.c index f6150e92dfc..36700e9e2be 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) return 0; } -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, - struct compat_rusage __user *ru) +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) { if (!ru) { return sys_wait4(pid, stat_addr, options, NULL); @@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, } } -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, uinfo, int, options, + struct compat_rusage __user *, uru) { siginfo_t info; struct rusage ru; @@ -584,7 +587,11 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, return ret; if (uru) { - ret = put_compat_rusage(&ru, uru); + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); if (ret) return ret; } @@ -994,7 +1001,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, sigset_from_compat(&s, &s32); if (uts) { - if (get_compat_timespec(&t, uts)) + if (compat_get_timespec(&t, uts)) return -EFAULT; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd5550..d566aba7e80 100644 --- a/kernel/context_tracking.c +++ 
b/kernel/context_tracking.c @@ -1,3 +1,19 @@ +/* + * Context tracking: Probe on high level context boundaries such as kernel + * and userspace. This includes syscalls and exceptions entry/exit. + * + * This is used by RCU to remove its dependency on the timer tick while a CPU + * runs in userspace. + * + * Started by Frederic Weisbecker: + * + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, + * Steven Rostedt, Peter Zijlstra for suggestions and improvements. + * + */ + #include <linux/context_tracking.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -6,8 +22,8 @@ struct context_tracking { /* - * When active is false, hooks are not set to - * minimize overhead: TIF flags are cleared + * When active is false, probes are unset in order + * to minimize overhead: TIF flags are cleared * and calls to user_enter/exit are ignored. This * may be further optimized using static keys. */ @@ -24,6 +40,15 @@ static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #endif }; +/** + * user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. + * + * This function must be called right before we switch from the kernel + * to userspace, when it's guaranteed the remaining kernel instructions + * to execute won't use any RCU read side critical section because this + * function sets RCU in extended quiescent state. + */ void user_enter(void) { unsigned long flags; @@ -39,40 +64,70 @@ void user_enter(void) if (in_interrupt()) return; + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); local_irq_save(flags); if (__this_cpu_read(context_tracking.active) && __this_cpu_read(context_tracking.state) != IN_USER) { __this_cpu_write(context_tracking.state, IN_USER); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ rcu_user_enter(); } local_irq_restore(flags); } + +/** + * user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. + * + * This function must be called after we entered the kernel from userspace + * before any use of RCU read side critical section. This potentially include + * any high level kernel code like syscalls, exceptions, signal handling, etc... + * + * This call supports re-entrancy. This way it can be called from any exception + * handler without needing to know if we came from userspace or not. + */ void user_exit(void) { unsigned long flags; - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ if (in_interrupt()) return; local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { __this_cpu_write(context_tracking.state, IN_KERNEL); + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). 
+ */ rcu_user_exit(); } local_irq_restore(flags); } + +/** + * context_tracking_task_switch - context switch the syscall callbacks + * @prev: the task that is being switched out + * @next: the task that is being switched in + * + * The context tracking uses the syscall slow path to implement its user-kernel + * boundaries probes on syscalls. This way it doesn't impact the syscall fast + * path on CPUs that don't do context tracking. + * + * But we need to clear the flag on the previous task because it may later + * migrate to some CPU that doesn't do the context tracking. As such the TIF + * flag may not be desired there. + */ void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f..8875254120b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("Module Size modstruct Used by\n"); list_for_each_entry(mod, kdb_modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d06f2..7b6646a8c06 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } /* + * Initialize event state based on the perf_event_attr::disabled. + */ +static inline void perf_event__state_init(struct perf_event *event) +{ + event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : + PERF_EVENT_STATE_INACTIVE; +} + +/* * Called at perf_event creation and when events are attached/detached from a * group. */ @@ -6179,8 +6188,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->overflow_handler = overflow_handler; event->overflow_handler_context = context; - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; + perf_event__state_init(event); pmu = NULL; @@ -6609,9 +6617,17 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&gctx->mutex); perf_remove_from_context(group_leader); + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. + */ + perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling); + perf_event__state_init(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); diff --git a/kernel/fork.c b/kernel/fork.c index 65ca6d27f24..c535f33bbb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1668,8 +1668,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, 0, - parent_tidptr, child_tidptr); + long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + asmlinkage_protect(5, ret, clone_flags, newsp, + parent_tidptr, child_tidptr, tls_val); + return ret; } #endif diff --git a/kernel/module.c b/kernel/module.c index b10b048367e..eab08274ec9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -188,6 +188,7 @@ struct load_info { ongoing or failed initialization etc. 
*/ static inline int strong_try_module_get(struct module *mod) { + BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); if (mod && mod->state == MODULE_STATE_COMING) return -EBUSY; if (try_module_get(mod)) @@ -343,6 +344,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) return true; } @@ -450,16 +454,24 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -struct module *find_module(const char *name) +static struct module *find_module_all(const char *name, + bool even_unformed) { struct module *mod; list_for_each_entry(mod, &modules, list) { + if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) + continue; if (strcmp(mod->name, name) == 0) return mod; } return NULL; } + +struct module *find_module(const char *name) +{ + return find_module_all(name, false); +} EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -525,6 +537,8 @@ bool is_module_percpu_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (!mod->percpu_size) continue; for_each_possible_cpu(cpu) { @@ -1048,6 +1062,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, case MODULE_STATE_GOING: state = "going"; break; + default: + BUG(); } return sprintf(buffer, "%s\n", state); } @@ -1786,6 +1802,8 @@ void set_all_modules_text_rw(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -1807,6 +1825,8 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -2527,6 +2547,13 @@ static int copy_module_from_fd(int fd, struct load_info *info) err = -EFBIG; goto out; } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + err = -EINVAL; + goto out; + } + info->hdr = vmalloc(stat.size); if (!info->hdr) { err = -ENOMEM; @@ -2990,8 +3017,9 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module(name); - ret = !mod || mod->state != MODULE_STATE_COMING; + mod = find_module_all(name, true); + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -3136,6 +3164,32 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + /* + * We try to place it in the list now to make sure it's unique + * before we dedicate too many resources. In particular, + * temporary percpu memory exhaustion. + */ + mod->state = MODULE_STATE_UNFORMED; +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. 
*/ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_module; + goto again; + } + err = -EEXIST; + mutex_unlock(&module_mutex); + goto free_module; + } + list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); + #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) @@ -3145,7 +3199,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) - goto free_module; + goto unlink_mod; /* Now we've got everything in the final locations, we can * find optional sections. */ @@ -3180,54 +3234,33 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } - /* Mark state as coming so strong_try_module_get() ignores us. */ - mod->state = MODULE_STATE_COMING; - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. No one should access us, since - * strong_try_module_get() will fail. - * lockdep/oops can run asynchronous, so use the RCU list insertion - * function to insert in a way safe to concurrent readers. - * The mutex protects against concurrent writers. - */ -again: - mutex_lock(&module_mutex); - if ((old = find_module(mod->name)) != NULL) { - if (old->state == MODULE_STATE_COMING) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_arch_cleanup; - goto again; - } - err = -EEXIST; - goto unlock; - } - - /* This has to be done once we're sure module name is unique. */ dynamic_debug_setup(info->debug, info->num_debug); - /* Find duplicate symbols */ + mutex_lock(&module_mutex); + /* Find duplicate symbols (must be called under lock). */ err = verify_export_symbols(mod); if (err < 0) - goto ddebug; + goto ddebug_cleanup; + /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - list_add_rcu(&mod->list, &modules); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) - goto unlink; + goto bug_cleanup; /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto unlink; + goto bug_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3237,16 +3270,13 @@ again: return do_init_module(mod); - unlink: + bug_cleanup: + /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); module_bug_cleanup(mod); - wake_up_all(&module_wq); - ddebug: - dynamic_debug_remove(info->debug); - unlock: + ddebug_cleanup: mutex_unlock(&module_mutex); + dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: @@ -3255,6 +3285,12 @@ again: free_modinfo(mod); free_unload: module_unload_free(mod); + unlink_mod: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. 
*/ + list_del_rcu(&mod->list); + wake_up_all(&module_wq); + mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); free_copy: @@ -3377,6 +3413,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { if (modname) @@ -3400,6 +3438,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3424,6 +3464,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3451,6 +3493,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -3493,9 +3537,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((ret = mod_find_symname(mod, name)) != 0) break; + } } preempt_enable(); return ret; @@ -3510,6 +3557,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, int ret; list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; for (i = 0; i < mod->num_symtab; i++) { ret = fn(data, mod->strtab + mod->symtab[i].st_name, mod, mod->symtab[i].st_value); @@ -3525,6 +3574,7 @@ static char *module_flags(struct module *mod, char *buf) { int bx = 0; + BUG_ON(mod->state == MODULE_STATE_UNFORMED); if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { @@ -3566,6 +3616,10 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; + /* We always ignore unformed modules. 
*/ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -3626,6 +3680,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (mod->num_exentries == 0) continue; @@ -3674,10 +3730,13 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_core(addr, mod) || within_module_init(addr, mod)) return mod; + } return NULL; } EXPORT_SYMBOL_GPL(__module_address); @@ -3730,8 +3789,11 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; printk(" %s%s", mod->name, module_flags(mod, buf)); + } preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); diff --git a/kernel/pid.c b/kernel/pid.c index de9af600006..f2c6a682509 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -331,7 +331,7 @@ out: return pid; out_unlock: - spin_unlock(&pidmap_lock); + spin_unlock_irq(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335..f24633afa46 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -88,12 +88,6 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_lock_dep_map = { - .name = "console_lock" -}; -#endif - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -1925,7 +1919,6 @@ void console_lock(void) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1947,7 +1940,6 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2110,7 +2102,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1599157336a..6cbeaae4406 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. 
*/ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + /* Lockless, nobody but us can set this flag */ + if (task->jobctl & JOBCTL_LISTENING) + return ret; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for @@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child) * RETURNS: * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, bool ignore_state) +static int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; @@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) + if (ignore_state || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !ignore_state) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. */ return ret; } @@ -317,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request, */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); spin_unlock(&task->sighand->siglock); @@ -737,7 +776,7 @@ int ptrace_request(struct task_struct *child, long request, * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; @@ -763,7 +802,7 @@ int ptrace_request(struct task_struct *child, long request, * start of this trap and now. Trigger re-trap. 
*/ if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); + ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); @@ -900,6 +939,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -1039,8 +1080,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2..7f8e7590e3e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) extern int rcu_expedited; +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b4..48ab70384a4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, unsigned long c) { - trace_rcu_torture_read(rcutorturename, rhp); + trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); } EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) #endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+ */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2..a0714a51b6d 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -#include "rcutiny_plugin.h" - static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; +#include "rcutiny_plugin.h" + /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * interrupts don't count, we must be running at the first interrupt * level. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return rcu_dynticks_nesting <= 1; } @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { + reset_cpu_stall_ticks(rcp); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { + check_cpu_stalls(); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309..8a233002fae 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -33,6 +33,9 @@ struct rcu_ctrlblk { struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ RCU_TRACE(char *name); /* Name of RCU type. 
*/ }; @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stall_preempt(void); + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall_preempt()); +} + #ifdef CONFIG_TINY_PREEMPT_RCU #include <linux/delay.h> @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); + reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall_preempt(void) +{ +#ifdef CONFIG_TINY_PREEMPT_RCU + check_cpu_stall(&rcu_preempt_ctrlblk.rcb); +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +} + #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85f..e1f3a8c9672 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -46,6 +46,7 @@ #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/trace_clock.h> #include <asm/byteorder.h> MODULE_LICENSE("GPL"); @@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. 
*/ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_uninterruptible(1); + schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) @@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void) return; if (atomic_xchg(&beenhere, 1) != 0) return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); ftrace_dump(DUMP_ALL); } @@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused) { int idx; int completed; + int completed_end; static DEFINE_RCU_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + unsigned long long ts; idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused) cur_ops->readunlock(idx); return; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1094,11 +1114,13 @@ static int rcu_torture_reader(void *arg) { int completed; + int completed_end; int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; + unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); @@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg) } idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg) schedule_timeout_interruptible(HZ); continue; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); @@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; @@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed_ptr(reader_tasks[i], shuffle_tmp_mask); } - if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], shuffle_tmp_mask); } - if (writer_task) set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; @@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_wq = kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), GFP_KERNEL); - if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614..5b8ad827fd8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_sched() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -217,12 +217,6 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); - static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; @@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* - * Does the current CPU require a yet-as-unscheduled grace period? + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry. */ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - struct rcu_head **ntp; + int i; - ntp = rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)]; - return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && - !rcu_gp_in_progress(rsp); + if (rcu_gp_in_progress(rsp)) + return 0; /* No, a grace period is already in progress. 
*/ + if (!rdp->nxttail[RCU_NEXT_TAIL]) + return 0; /* No, this is a no-CBs (or offline) CPU. */ + if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + return 1; /* Yes, this CPU has newly registered callbacks. */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rdp->nxttail[i - 1] != rdp->nxttail[i] && + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + rdp->nxtcompleted[i])) + return 1; /* Yes, CBs for future grace period. */ + return 0; /* No grace period needed. */ } /* @@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, 0); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); * interrupt from idle, return true. The caller must have at least * disabled preemption. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } @@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static int jiffies_till_stall_check(void) -{ - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; - till_stall_check = 3; - } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } /* @@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + - 3 * jiffies_till_stall_check() + 3; + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ @@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - /** * rcu_cpu_stall_reset - prevent further stall warnings in current grace period * @@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} - /* * Update CPU-local rcu_data state to record the newly noticed grace period. 
* This is used both when we started the grace period and when we notice @@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp) } /* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period. This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + /* + * If RCU is idle, we just wait for the next grace period. + * But we can only be sure that RCU is idle if we are looking + * at the root rcu_node structure -- otherwise, a new grace + * period might have started, but just not yet gotten around + * to initializing the current non-root rcu_node structure. + */ + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) + return rnp->completed + 1; + + /* + * Otherwise, wait for a possible partial grace period and + * then the subsequent full grace period. + */ + return rnp->completed + 2; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned. Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure. This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + int i; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Starting from the sublist containing the callbacks most + * recently assigned a ->completed number and working down, find the + * first sublist that is not assignable to an upcoming grace period. + * Such a sublist has something in it (first two tests) and has + * a ->completed number assigned that will complete sooner than + * the ->completed number for newly arrived callbacks (last test). + * + * The key point is that any later sublist can be assigned the + * same ->completed number as the newly arrived callbacks, which + * means that the callbacks in any of these later sublist can be + * grouped into a single sublist, whether or not they have already + * been assigned a ->completed number. + */ + c = rcu_cbs_completed(rsp, rnp); + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] != rdp->nxttail[i - 1] && + !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) + break; + + /* + * If there are no sublist for unassigned callbacks, leave. + * At the same time, advance "i" one sublist, so that "i" will + * index into the sublist where all the remaining callbacks should + * be grouped into. + */ + if (++i >= RCU_NEXT_TAIL) + return; + + /* + * Assign all subsequent callbacks' ->completed number to the next + * full grace period and group them all in the sublist initially + * indexed by "i". + */ + for (; i <= RCU_NEXT_TAIL; i++) { + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtcompleted[i] = c; + } + + /* Trace depending on how much we were able to accelerate. 
*/ + if (!*rdp->nxttail[RCU_WAIT_TAIL]) + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + else + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist. This function is idempotent, so it does not hurt to + * invoke it repeatedly. As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i, j; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Find all callbacks whose ->completed numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) + break; + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; + } + /* Clean up any sublist tail pointers that were misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + + /* Copy down callbacks to fill in empty sublists. */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) + break; + rdp->nxttail[j] = rdp->nxttail[i]; + rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; + } + + /* Classify any remaining callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); +} + +/* * Advance this CPU's callbacks, but only if the current grace period * has ended. This may be called only from the CPU to whom the rdp * belongs. In addition, the corresponding leaf rcu_node structure's @@ -1080,12 +1190,15 @@ static void __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Did another grace period end? */ - if (rdp->completed != rnp->completed) { + if (rdp->completed == rnp->completed) { - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + /* No, so just accelerate recent callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + + } else { + + /* Advance callbacks. */ + rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* * Because there is no grace period in progress right now, * any callbacks we have up to this point will be satisfied - * by the next grace period. So promote all callbacks to be - * handled after the end of the next grace period. If the - * CPU is not yet aware of the end of the previous grace period, - * we need to allow for the callback advancement that will - * occur when it does become aware. Deadlock prevents us from - * making it aware at this point: We cannot acquire a leaf - * rcu_node ->lock while holding the root rcu_node ->lock. + * by the next grace period. So this is a good place to + * assign a grace period number to recently posted callbacks. 
*/ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - if (rdp->completed == rsp->completed) - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ @@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } @@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) long bl, count, count_lazy; int i; - /* If no callbacks are ready, just return.*/ + /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), @@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * Advance callbacks in response to end of earlier grace - * period that some other CPU ended. - */ + /* Handle the end of a grace period that some other CPU ended. */ rcu_process_gp_end(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); /* Does this CPU require a not-yet-started grace period? */ + local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ rcu_start_gp(rsp, flags); /* releases above lock */ + } else { + local_irq_restore(flags); } /* If there are callbacks ready, invoke them. */ @@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -#ifdef CONFIG_RCU_USER_QS - WARN_ON_ONCE(rdp->dynticks->in_user); -#endif rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Silence gcc 4.8 warning about array index out of range. */ + if (rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls overflow"); + /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) @@ -3074,7 +3181,6 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093..c896b5045d9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,10 +102,6 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -#ifdef CONFIG_RCU_USER_QS - bool ignore_user_qs; /* Treat userspace as extended QS or not */ - bool in_user; /* Is the CPU in userland from RCU POV? */ -#endif }; /* RCU's kthread states for tracing. 
*/ @@ -282,6 +278,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; + unsigned long nxtcompleted[RCU_NEXT_SIZE]; + /* grace periods for sublists. */ long qlen_lazy; /* # of lazy queued callbacks */ long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; @@ -343,11 +341,6 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f6e5ec2932b..c1cc7e17ff9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -40,8 +40,7 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ -static bool rcu_nocb_poll; /* Offload kthread are to poll. */ -module_param(rcu_nocb_poll, bool, 0444); +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static char __initdata nocb_buf[NR_CPUS * 5]; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -2159,6 +2158,13 @@ static int __init rcu_nocb_setup(char *str) } __setup("rcu_nocbs=", rcu_nocb_setup); +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = 1; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + /* Is the specified CPU a no-CPUs CPU? */ static bool is_nocb_cpu(int cpu) { @@ -2366,10 +2372,11 @@ static int rcu_nocb_kthread(void *arg) for (;;) { /* If not polling, wait for next batch of callbacks. */ if (!rcu_nocb_poll) - wait_event(rdp->nocb_wq, rdp->nocb_head); + wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); list = ACCESS_ONCE(rdp->nocb_head); if (!list) { schedule_timeout_interruptible(1); + flush_signals(current); continue; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb..26058d0bebb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1523,7 +1523,8 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2cd3c1b4e58..7ae4c4c5420 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -222,8 +222,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", - atomic64_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", + (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eea8707234..81fa5364340 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2663,7 +2663,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -static void unthrottle_offline_cfs_rqs(struct rq *rq) +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 418feb01344..4f02b284735 100644 --- 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344..4f02b284735 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -566,7 +566,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 	int i, weight, more = 0;
 	u64 rt_period;
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 372771e948c..3d09cf6cde7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
  * No need to set need_resched since signal event passing
  * goes through ->blocked
  */
-void signal_wake_up(struct task_struct *t, int resume)
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
 {
-	unsigned int mask;
-
 	set_tsk_thread_flag(t, TIF_SIGPENDING);
-
 	/*
-	 * For SIGKILL, we want to wake it up in the stopped/traced/killable
+	 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
 	 * case. We don't check t->state here because there is a race with it
 	 * executing another processor and just now entering stopped state.
 	 * By using wake_up_state, we ensure the process will wake up and
 	 * handle its death signal.
 	 */
-	mask = TASK_INTERRUPTIBLE;
-	if (resume)
-		mask |= TASK_WAKEKILL;
-	if (!wake_up_state(t, mask))
+	if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
 		kick_process(t);
 }
 
@@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t)
 	assert_spin_locked(&t->sighand->siglock);
 
 	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
-	signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+	ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
 }
 
 /*
@@ -1800,6 +1794,10 @@ static inline int may_ptrace_stop(void)
 	 * If SIGKILL was already sent before the caller unlocked
 	 * ->siglock we must see ->core_state != NULL. Otherwise it
 	 * is safe to enter schedule().
+	 *
+	 * This is almost outdated, a task with the pending SIGKILL can't
+	 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+	 * after SIGKILL was already dequeued.
 	 */
 	if (unlikely(current->mm->core_state) &&
 	    unlikely(current->mm == current->parent->mm))
@@ -1925,6 +1923,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		if (gstop_done)
 			do_notify_parent_cldstop(current, false, why);
 
+		/* tasklist protects us from ptrace_freeze_traced() */
 		__set_current_state(TASK_RUNNING);
 		if (clear_code)
 			current->exit_code = 0;
@@ -3116,8 +3115,9 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
 
 #ifdef CONFIG_COMPAT
 #ifdef CONFIG_GENERIC_SIGALTSTACK
-asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
-				       compat_stack_t __user *uoss_ptr)
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+			const compat_stack_t __user *, uss_ptr,
+			compat_stack_t __user *, uoss_ptr)
 {
 	stack_t uss, uoss;
 	int ret;
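
Since signal_wake_up_state() above now takes the extra task states directly, the old signal_wake_up() and the ptrace_signal_wake_up() used by ptrace_trap_notify() can be thin wrappers that pass TASK_WAKEKILL or __TASK_TRACED respectively; those wrappers live in a header that is not part of this kernel/-only diff, so their exact form is assumed here. A toy, runnable illustration of how the state mask composes (flag values are made up for the example):

#include <stdio.h>

/* Illustrative flag values only; the real ones live in <linux/sched.h>. */
#define TASK_INTERRUPTIBLE	0x0001
#define __TASK_TRACED		0x0008
#define TASK_WAKEKILL		0x0100

/* Sketch of the new calling convention: callers pass the extra states a
 * wakeup may pull the task out of, and TASK_INTERRUPTIBLE is always added. */
static unsigned int wake_mask(unsigned int state)
{
	return state | TASK_INTERRUPTIBLE;
}

int main(void)
{
	/* ordinary signal delivery */
	printf("signal_wake_up(resume=0):   0x%04x\n", wake_mask(0));
	/* fatal signal: may also wake a stopped/traced/killable task */
	printf("signal_wake_up(resume=1):   0x%04x\n", wake_mask(TASK_WAKEKILL));
	/* ptrace notification: may wake a TASK_TRACED tracee */
	printf("ptrace_signal_wake_up(1):   0x%04x\n", wake_mask(__TASK_TRACED));
	return 0;
}
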
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f..69f38bd98b4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,6 +33,7 @@ struct call_function_data {
 	struct call_single_data	csd;
 	atomic_t		refs;
 	cpumask_var_t		cpumask;
+	cpumask_var_t		cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -56,6 +57,9 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return notifier_from_errno(-ENOMEM);
+		if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+				cpu_to_node(cpu)))
+			return notifier_from_errno(-ENOMEM);
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +69,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		free_cpumask_var(cfd->cpumask);
+		free_cpumask_var(cfd->cpumask_ipi);
 		break;
 #endif
 	};
@@ -526,6 +531,12 @@ void smp_call_function_many(const struct cpumask *mask,
 		return;
 	}
 
+	/*
+	 * After we put an entry into the list, data->cpumask
+	 * may be cleared again when another CPU sends another IPI for
+	 * a SMP function call, so data->cpumask will be zero.
+	 */
+	cpumask_copy(data->cpumask_ipi, data->cpumask);
 	raw_spin_lock_irqsave(&call_function.lock, flags);
 	/*
 	 * Place entry at the _HEAD_ of the list, so that any cpu still
@@ -549,7 +560,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(data->cpumask);
+	arch_send_call_function_ipi_mask(data->cpumask_ipi);
 
 	/* Optionally wait for the CPUs to complete */
 	if (wait)
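
The kernel/smp.c change above works around a race: once the call_function entry is visible on the list, target CPUs answering someone else's IPI may already execute the request and clear their bits in the shared ->cpumask, so the sender snapshots it into ->cpumask_ipi first and sends the IPI to the snapshot. A toy single-threaded model of that ordering, using plain arrays instead of cpumask_var_t and invented names:

#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

/* Toy model: targets clear their bit in the shared mask as they consume
 * the request, so the sender must IPI a private snapshot of the mask. */
struct call_data {
	unsigned char cpumask[NR_CPUS];		/* shared, cleared by targets */
	unsigned char cpumask_ipi[NR_CPUS];	/* sender's private snapshot  */
};

static void target_cpu_consumes(struct call_data *d, int cpu)
{
	d->cpumask[cpu] = 0;	/* target clears its bit once it has run */
}

int main(void)
{
	struct call_data d = { { 0, 1, 1, 1 }, { 0 } };

	/* Snapshot before the entry becomes visible to other CPUs. */
	memcpy(d.cpumask_ipi, d.cpumask, sizeof(d.cpumask));

	/* A target woken by an earlier IPI may already clear its bit... */
	target_cpu_consumes(&d, 2);

	/* ...yet the IPI still reaches every CPU we intended to signal. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (d.cpumask_ipi[cpu])
			printf("send IPI to cpu %d\n", cpu);
	return 0;
}
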
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc..01d5ccb8bfe 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
  */
 void cleanup_srcu_struct(struct srcu_struct *sp)
 {
-	int sum;
-
-	sum = srcu_readers_active(sp);
-	WARN_ON(sum);  /* Leakage unless caller handles error. */
-	if (sum != 0)
-		return;
+	if (WARN_ON(srcu_readers_active(sp)))
+		return; /* Leakage unless caller handles error. */
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
 	int idx;
 
+	idx = ACCESS_ONCE(sp->completed) & 0x1;
 	preempt_disable();
-	idx = rcu_dereference_index_check(sp->completed,
-					  rcu_read_lock_sched_held()) & 0x1;
 	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
 	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
  */
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
-	preempt_disable();
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
-	preempt_enable();
+	this_cpu_dec(sp->per_cpu_ref->c[idx]);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 			   !lock_is_held(&rcu_sched_lock_map),
 			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
 
+	might_sleep();
 	init_completion(&rcu.completion);
 
 	head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @sp: srcu_struct with which to synchronize.
 *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=((->completed & 1) ^ 1) to drain to zero at first,
+ * and then flip the completed and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
- * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
- * synchronize_srcu_expedited() from the corresponding SRCU read-side
- * critical section; doing so will result in deadlock.  However, it is
- * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
- * from some other srcu_struct's read-side critical section, as long as
+ * Note that it is also illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section;
+ * doing so will result in deadlock. However, it is perfectly legal
+ * to call synchronize_srcu_expedited() on one srcu_struct from some
+ * other srcu_struct's read-side critical section, as long as
 * the resulting graph of srcu_structs is acyclic.
 */
 void synchronize_srcu_expedited(struct srcu_struct *sp)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ffe4c5ad3f..41473b4ad7a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3998,7 +3998,7 @@ static int ftrace_module_notify(struct notifier_block *self,
 
 struct notifier_block ftrace_module_nb = {
 	.notifier_call = ftrace_module_notify,
-	.priority = 0,
+	.priority = INT_MAX,	/* Run before anything that can use kprobes */
 };
 
 extern unsigned long __start_mcount_loc[];
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cb..1bbb1b200ce 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -44,6 +44,7 @@ u64 notrace trace_clock_local(void)
 
 	return clock;
 }
+EXPORT_SYMBOL_GPL(trace_clock_local);
 
 /*
  * trace_clock(): 'between' trace clock. Not completely serialized,
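
Returning to the kernel/srcu.c changes above: the rewritten synchronize_srcu() comment describes SRCU's flip-and-drain scheme, where readers bump a per-CPU counter selected by the low bit of ->completed and the updater flips that bit and waits for the old "colour" to drain. A toy, single-threaded model of the idea, with no memory barriers, no per-CPU accessors, and omitting the pre-flip drain of the inactive index that the comment mentions for starvation avoidance:

#include <stdio.h>

#define NR_CPUS 2

/* Toy model only -- illustration of the two reader-count colours. */
struct toy_srcu {
	unsigned long completed;
	long c[NR_CPUS][2];	/* per-CPU reader counts for each colour */
};

static int toy_read_lock(struct toy_srcu *sp, int cpu)
{
	int idx = sp->completed & 0x1;	/* pick the current colour */

	sp->c[cpu][idx]++;
	return idx;
}

static void toy_read_unlock(struct toy_srcu *sp, int cpu, int idx)
{
	sp->c[cpu][idx]--;
}

static long toy_readers(struct toy_srcu *sp, int idx)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += sp->c[cpu][idx];
	return sum;
}

static void toy_synchronize(struct toy_srcu *sp)
{
	int old = sp->completed & 0x1;

	sp->completed++;			/* flip the colour */
	while (toy_readers(sp, old) != 0)	/* wait for old readers */
		;				/* (real code sleeps/retries) */
}

int main(void)
{
	struct toy_srcu sp = { 0, { { 0 } } };
	int idx = toy_read_lock(&sp, 0);

	toy_read_unlock(&sp, 0, idx);	/* reader finishes before the GP */
	toy_synchronize(&sp);
	printf("completed = %lu\n", sp.completed);
	return 0;
}
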