diff options
Diffstat (limited to 'kernel')
66 files changed, 3146 insertions, 1360 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb41b9547c9..6c07f30fa9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smpboot.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index af1de0f34ea..4b96415527b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -67,6 +67,7 @@ #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/fs_struct.h> +#include <linux/compat.h> #include "audit.h" @@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) audit_log_end(ab); } -void __audit_seccomp(unsigned long syscall) +void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_abend(ab, "seccomp", signr); audit_log_format(ab, " syscall=%ld", syscall); + audit_log_format(ab, " compat=%d", is_compat_task()); + audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); + audit_log_format(ab, " code=0x%x", code); audit_log_end(ab); } diff --git a/kernel/compat.c b/kernel/compat.c index 74ff8498809..d2c67aa49ae 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) #ifdef __ARCH_WANT_SYS_SIGPROCMASK -asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, - compat_old_sigset_t __user *oset) +/* + * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the + * blocked set of signals to the supplied signal set + */ +static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) { - old_sigset_t s; - long ret; - mm_segment_t old_fs; + memcpy(blocked->sig, &set, sizeof(set)); +} - if (set && get_user(s, set)) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_sigprocmask(how, - set ? (old_sigset_t __user *) &s : NULL, - oset ? (old_sigset_t __user *) &s : NULL); - set_fs(old_fs); - if (ret == 0) - if (oset) - ret = put_user(s, oset); - return ret; +asmlinkage long compat_sys_sigprocmask(int how, + compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset) +{ + old_sigset_t old_set, new_set; + sigset_t new_blocked; + + old_set = current->blocked.sig[0]; + + if (nset) { + if (get_user(new_set, nset)) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + new_blocked = current->blocked; + + switch (how) { + case SIG_BLOCK: + sigaddsetmask(&new_blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(&new_blocked, new_set); + break; + case SIG_SETMASK: + compat_sig_setmask(&new_blocked, new_set); + break; + default: + return -EINVAL; + } + + set_current_blocked(&new_blocked); + } + + if (oset) { + if (put_user(old_set, oset)) + return -EFAULT; + } + + return 0; } #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e5702..0e6353cf147 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -17,6 +17,8 @@ #include <linux/gfp.h> #include <linux/suspend.h> +#include "smpboot.h" + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct task_struct *idle; if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; cpu_hotplug_begin(); + + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; @@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) } /* Arch-specific enabling code. */ - ret = __cpu_up(cpu); + ret = __cpu_up(cpu, idle); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) out_notify: if (ret != 0) __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); +out: cpu_hotplug_done(); return ret; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2382683617a..8c8bd652dd1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = { * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. + * return cpu_online_mask. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_mask. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. + * of cpu_online_mask. * * Call with callback_mutex held. */ @@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; int is_load_balanced; - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2138,7 +2138,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the + * subset of cpu_online_mask, even if this means going outside the * tasks cpuset. **/ diff --git a/kernel/cred.c b/kernel/cred.c index 97b36eeca4c..e70683d9ec3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -386,6 +386,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; + p->replacement_session_keyring = NULL; + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1dc53bae56e..0557f24c6bc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -160,37 +160,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); if (err) return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + return err; } -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } int __weak kgdb_validate_break_address(unsigned long addr) { - char tmp_variable[BREAK_INSTR_SIZE]; + struct kgdb_bkpt tmp; int err; - /* Validate setting the breakpoint and then removing it. In the + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we * found them. */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); + tmp.bpt_addr = addr; + err = kgdb_arch_set_breakpoint(&tmp); if (err) return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + err = kgdb_arch_remove_breakpoint(&tmp); if (err) printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " "memory destroyed at: %lx", addr); @@ -234,7 +236,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) */ int dbg_activate_sw_breakpoints(void) { - unsigned long addr; int error; int ret = 0; int i; @@ -243,16 +244,15 @@ int dbg_activate_sw_breakpoints(void) if (kgdb_break[i].state != BP_SET) continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + printk(KERN_INFO "KGDB: BP install failed: %lx", + kgdb_break[i].bpt_addr); continue; } - kgdb_flush_swbreak_addr(addr); + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); kgdb_break[i].state = BP_ACTIVE; } return ret; @@ -301,7 +301,6 @@ int dbg_set_sw_break(unsigned long addr) int dbg_deactivate_sw_breakpoints(void) { - unsigned long addr; int error; int ret = 0; int i; @@ -309,15 +308,14 @@ int dbg_deactivate_sw_breakpoints(void) for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { if (kgdb_break[i].state != BP_ACTIVE) continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } - kgdb_flush_swbreak_addr(addr); + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); kgdb_break[i].state = BP_SET; } return ret; @@ -351,7 +349,6 @@ int kgdb_isremovedbreak(unsigned long addr) int dbg_remove_all_break(void) { - unsigned long addr; int error; int i; @@ -359,12 +356,10 @@ int dbg_remove_all_break(void) for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { if (kgdb_break[i].state != BP_ACTIVE) goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); + kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b5f17da1c5..bb9520f0f6f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -743,7 +743,7 @@ kdb_printit: kdb_input_flush(); c = console_drivers; - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(moreprompt); cp = moreprompt; while (len--) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a6a9ec4cd8f..fd126f82b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(event, func); func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); + perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); } diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff1..ad54c833116 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -34,6 +34,7 @@ #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> @@ -47,6 +48,7 @@ #include <linux/audit.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/proc_fs.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/ksm.h> @@ -111,32 +113,67 @@ int nr_processes(void) return total; } -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct_node(node) \ - kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) -# define free_task_struct(tsk) \ - kmem_cache_free(task_struct_cachep, (tsk)) +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; + +static inline struct task_struct *alloc_task_struct_node(int node) +{ + return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); +} + +void __weak arch_release_task_struct(struct task_struct *tsk) { } + +static inline void free_task_struct(struct task_struct *tsk) +{ + arch_release_task_struct(tsk); + kmem_cache_free(task_struct_cachep, tsk); +} #endif -#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +void __weak arch_release_thread_info(struct thread_info *ti) { } + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +# if THREAD_SIZE >= PAGE_SIZE static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { -#ifdef CONFIG_DEBUG_STACK_USAGE - gfp_t mask = GFP_KERNEL | __GFP_ZERO; -#else - gfp_t mask = GFP_KERNEL; -#endif - struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { + arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } +# else +static struct kmem_cache *thread_info_cache; + +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) +{ + return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); +} + +static void free_thread_info(struct thread_info *ti) +{ + arch_release_thread_info(ti); + kmem_cache_free(thread_info_cache, ti); +} + +void thread_info_cache_init(void) +{ + thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + THREAD_SIZE, 0, NULL); + BUG_ON(thread_info_cache == NULL); +} +# endif #endif /* SLAB cache for signal_struct structures (tsk->signal) */ @@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk) free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); + put_seccomp_filter(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(__put_task_struct); -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. - */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif +void __init __weak arch_task_cache_init(void) { } void __init fork_init(unsigned long mempages) { -#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif @@ -1162,6 +1194,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); + get_seccomp_filter(p); rt_mutex_init_task(p); @@ -1464,6 +1497,8 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2..6df614912b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); - if (sysctl_hung_task_panic) + if (sysctl_hung_task_panic) { + trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); + } } /* diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index cf1a4a68ce4..d1a758bc972 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG help This option will show the mapping relationship between hardware irq numbers and Linux irq numbers. The mapping is exposed via debugfs - in the file "virq_mapping". + in the file "irq_domain_mapping". If you don't know what this means you don't need it. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c3..fc275e4f629 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); @@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) out_unlock: raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 97a8bfadc88..e75e29e4434 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,10 +4,10 @@ #include <linux/kallsyms.h> -#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) +#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ -#define PD(f) do { } while (0) +#define ___PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOTHREAD); - P(IRQ_NOAUTOEN); + ___P(IRQ_LEVEL); + ___P(IRQ_PER_CPU); + ___P(IRQ_NOPROBE); + ___P(IRQ_NOREQUEST); + ___P(IRQ_NOTHREAD); + ___P(IRQ_NOAUTOEN); - PS(IRQS_AUTODETECT); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_PENDING); + ___PS(IRQS_AUTODETECT); + ___PS(IRQS_REPLAY); + ___PS(IRQS_WAITING); + ___PS(IRQS_PENDING); - PD(IRQS_INPROGRESS); - PD(IRQS_DISABLED); - PD(IRQS_MASKED); + ___PD(IRQS_INPROGRESS); + ___PD(IRQS_DISABLED); + ___PD(IRQS_MASKED); } -#undef P -#undef PS -#undef PD +#undef ___P +#undef ___PS +#undef ___PD diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95e..192a302d6cf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } +EXPORT_SYMBOL(irq_to_desc); static void delete_irq_desc(unsigned int irq) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3601f3fbf67..0e0ba5f840b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); -static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; /** @@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_NOMAP, ops, host_data); - if (domain) + if (domain) { + domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); + } return domain; } @@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain) irq_default_domain = domain; } -/** - * irq_set_virq_count() - Set the maximum number of linux irqs - * @count: number of linux irqs, capped with NR_IRQS - * - * This is mainly for use by platforms like iSeries who want to program - * the virtual irq number in the controller to avoid the reverse mapping - */ -void irq_set_virq_count(unsigned int count) -{ - pr_debug("irq: Trying to set virq count to %d\n", count); - - BUG_ON(count < NUM_ISA_INTERRUPTS); - if (count < NR_IRQS) - irq_virq_count = count; -} - static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { @@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) pr_debug("irq: create_direct virq allocation failed\n"); return 0; } - if (virq >= irq_virq_count) { + if (virq >= domain->revmap_data.nomap.max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - irq_virq_count); + domain->revmap_data.nomap.max_irq); irq_free_desc(virq); return 0; } - pr_debug("irq: create_direct obtained virq %d\n", virq); if (irq_setup_virq(domain, virq, virq)) { @@ -350,7 +335,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int virq, hint; + unsigned int hint; + int virq; pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -377,13 +363,13 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return irq_domain_legacy_revmap(domain, hwirq); /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; + hint = hwirq % nr_irqs; if (hint == 0) hint++; virq = irq_alloc_desc_from(hint, 0); - if (!virq) + if (virq <= 0) virq = irq_alloc_desc_from(1, 0); - if (!virq) { + if (virq <= 0) { pr_debug("irq: -> virq allocation failed\n"); return 0; } @@ -515,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; - unsigned int hint = hwirq % irq_virq_count; + unsigned int hint = hwirq % nr_irqs; /* Look for default domain if nececssary */ if (domain == NULL) @@ -536,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; - if (i >= irq_virq_count) + if (i >= nr_irqs) i = 1; } while(i != hint); return 0; @@ -642,8 +628,9 @@ static int virq_debug_show(struct seq_file *m, void *private) void *data; int i; - seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "domain name"); + seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + "chip name", (int)(2 * sizeof(void *) + 2), "chip data", + "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -666,7 +653,7 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "%-15s ", p); data = irq_desc_get_chip_data(desc); - seq_printf(m, "0x%16p ", data); + seq_printf(m, data ? "0x%p " : " %p ", data); if (desc->irq_data.domain && desc->irq_data.domain->of_node) p = desc->irq_data.domain->of_node->full_name; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569..585f6381f8e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, - chip ? (chip->name ? : "unknown") : "unknown"); + pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -837,8 +837,7 @@ void exit_irq_thread(void) action = kthread_data(tsk); - printk(KERN_ERR - "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, action->irq); desc = irq_to_desc(action->irq); @@ -878,7 +877,6 @@ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; - const char *old_name = NULL; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; @@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (!((old->flags & new->flags) & IRQF_SHARED) || ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || - ((old->flags ^ new->flags) & IRQF_ONESHOT)) { - old_name = old->name; + ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; - } /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != @@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * all existing action->thread_mask bits. */ new->thread_mask = 1 << ffz(thread_mask); + + } else if (new->handler == irq_default_primary_handler) { + /* + * The interrupt was requested with handler = NULL, so + * we use the default primary handler for it. But it + * does not have the oneshot flag set. In combination + * with level interrupts this is deadly, because the + * default primary handler just wakes the thread, then + * the irq lines is reenabled, but the device still + * has the level irq asserted. Rinse and repeat.... + * + * While this works for edge type interrupts, we play + * it safe and reject unconditionally because we can't + * say for sure which type this interrupt really + * has. The type flags are unreliable as the + * underlying chip implementation can override them. + */ + pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + irq); + ret = -EINVAL; + goto out_mask; } if (!shared) { @@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return 0; mismatch: -#ifdef CONFIG_DEBUG_SHIRQ if (!(new->flags & IRQF_PROBE_SHARED)) { - printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); - if (old_name) - printk(KERN_ERR "current handler: %s\n", old_name); + pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + irq, new->flags, new->name, old->flags, old->name); +#ifdef CONFIG_DEBUG_SHIRQ dump_stack(); - } #endif + } ret = -EBUSY; out_mask: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a..cb228bf2176 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -103,8 +103,13 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) { + /* + * Only interrupts which are marked as wakeup source + * and have not been disabled before the suspend check + * can abort suspend. + */ if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->istate & IRQS_PENDING) + if (desc->depth == 1 && desc->istate & IRQS_PENDING) return -EBUSY; continue; } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c..6454db7b6a4 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still - * active. + * active. Clear the pending bit so suspend/resume does not + * get confused. */ - if (irq_settings_is_level(desc)) + if (irq_settings_is_level(desc)) { + desc->istate &= ~IRQS_PENDING; return; + } if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c3c46c72046..1588e3b2871 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -5,11 +5,13 @@ * context. The enqueueing is NMI-safe. */ +#include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> #include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/irqflags.h> #include <asm/processor.h> /* diff --git a/kernel/itimer.c b/kernel/itimer.c index 22000c3db0d..8d262b46757 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (value) { if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) return -EFAULT; - } else - memset((char *) &set_buffer, 0, sizeof(set_buffer)); + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); if (error || !ovalue) diff --git a/kernel/kmod.c b/kernel/kmod.c index 957a7aab8eb..05698a7415f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -322,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work) * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. */ -static int usermodehelper_disabled = 1; +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); @@ -334,32 +334,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); /* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + +/* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -void read_lock_usermodehelper(void) +int usermodehelper_read_trylock(void) { + DEFINE_WAIT(wait); + int ret = 0; + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + + up_read(&umhelper_sem); + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return ret; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); + +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; } -EXPORT_SYMBOL_GPL(read_lock_usermodehelper); +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); -void read_unlock_usermodehelper(void) +void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } -EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** - * usermodehelper_disable - prevent new helpers from being started + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. + * depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. */ -int usermodehelper_disable(void) +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + +/** + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. + */ +int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; + if (!depth) + return -EINVAL; + down_write(&umhelper_sem); - usermodehelper_disabled = 1; + usermodehelper_disabled = depth; up_write(&umhelper_sem); /* @@ -374,31 +452,10 @@ int usermodehelper_disable(void) if (retval) return 0; - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); + __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } -/** - * usermodehelper_enable - allow new helpers to be started again - */ -void usermodehelper_enable(void) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); -} - -/** - * usermodehelper_is_disabled - check if new helpers are allowed to be started - */ -bool usermodehelper_is_disabled(void) -{ - return usermodehelper_disabled; -} -EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); - static void helper_lock(void) { atomic_inc(&running_helpers); diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e42..a4e60973ca7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2953,7 +2953,7 @@ static struct module *load_module(void __user *umod, /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL); + -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) goto unlink; diff --git a/kernel/padata.c b/kernel/padata.c index 6f10eb285ec..89fe3d1b9ef 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,6 +1,8 @@ /* * padata.c - generic interface to process data streams in parallel * + * See Documentation/padata.txt for an api documentation. + * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> * @@ -354,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd, if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) return -ENOMEM; - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pd->cpumask.cbcpu); return -ENOMEM; } - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); return 0; } @@ -564,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier); static bool padata_validate_cpumask(struct padata_instance *pinst, const struct cpumask *cpumask) { - if (!cpumask_intersects(cpumask, cpu_active_mask)) { + if (!cpumask_intersects(cpumask, cpu_online_mask)) { pinst->flags |= PADATA_INVALID; return false; } @@ -678,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; - if (cpumask_test_cpu(cpu, cpu_active_mask)) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) { pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, pinst->cpumask.cbcpu); if (!pd) @@ -746,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return -ENOMEM; padata_replace(pinst, pd); + + cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); + cpumask_clear_cpu(cpu, pd->cpumask.pcpu); } return 0; diff --git a/kernel/panic.c b/kernel/panic.c index 80aed44e345..8ed89a175d7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -97,7 +97,7 @@ void panic(const char *fmt, ...) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!oops_in_progress) + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) dump_stack(); #endif diff --git a/kernel/params.c b/kernel/params.c index f37d8263134..ed35345be53 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) static int parse_one(char *param, char *val, + const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, - int (*handle_unknown)(char *param, char *val)) + int (*handle_unknown)(char *param, char *val, + const char *doing)) { unsigned int i; int err; @@ -104,8 +106,8 @@ static int parse_one(char *param, if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) return -EINVAL; - pr_debug("They are equal! Calling %p\n", - params[i].ops->set); + pr_debug("handling %s with %p\n", param, + params[i].ops->set); mutex_lock(¶m_lock); err = params[i].ops->set(val, ¶ms[i]); mutex_unlock(¶m_lock); @@ -114,11 +116,11 @@ static int parse_one(char *param, } if (handle_unknown) { - pr_debug("Unknown argument: calling %p\n", handle_unknown); - return handle_unknown(param, val); + pr_debug("doing %s: %s='%s'\n", doing, param, val); + return handle_unknown(param, val, doing); } - pr_debug("Unknown argument `%s'\n", param); + pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } @@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *name, +int parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned num, s16 min_level, s16 max_level, - int (*unknown)(char *param, char *val)) + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; - pr_debug("Parsing ARGS: %s\n", args); - /* Chew leading spaces */ args = skip_spaces(args); + if (*args) + pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); + while (*args) { int ret; int irq_was_disabled; args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, + ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); - if (irq_was_disabled && !irqs_disabled()) { - printk(KERN_WARNING "parse_args(): option '%s' enabled " - "irq's!\n", param); - } + if (irq_was_disabled && !irqs_disabled()) + pr_warn("%s: option '%s' enabled irq's!\n", + doing, param); + switch (ret) { case -ENOENT: - printk(KERN_ERR "%s: Unknown parameter `%s'\n", - name, param); + pr_err("%s: Unknown parameter `%s'\n", doing, param); return ret; case -ENOSPC: - printk(KERN_ERR - "%s: `%s' too large for parameter `%s'\n", - name, val ?: "", param); + pr_err("%s: `%s' too large for parameter `%s'\n", + doing, val ?: "", param); return ret; case 0: break; default: - printk(KERN_ERR - "%s: `%s' invalid for parameter `%s'\n", - name, val ?: "", param); + pr_err("%s: `%s' invalid for parameter `%s'\n", + doing, val ?: "", param); return ret; } } @@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { - printk(KERN_ERR "%s: string parameter too long\n", - kp->name); + pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } @@ -400,8 +399,7 @@ static int param_array(const char *name, int len; if (*num == max) { - printk(KERN_ERR "%s: can only take %i arguments\n", - name, max); + pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); @@ -420,8 +418,7 @@ static int param_array(const char *name, } while (save == ','); if (*num < min) { - printk(KERN_ERR "%s: needs at least %i arguments\n", - name, min); + pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; @@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) const struct kparam_string *kps = kp->str; if (strlen(val)+1 > kps->maxlen) { - printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", + pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } @@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) #endif if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR - "Module '%s' failed add to sysfs, error number %d\n", + pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); - printk(KERN_ERR - "The system will be unstable now.\n"); return NULL; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0a186cfde78..e09dfbfeece 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -16,7 +16,6 @@ #include <linux/string.h> #include <linux/device.h> #include <linux/async.h> -#include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> @@ -611,14 +610,10 @@ int hibernate(void) if (error) goto Exit; - error = usermodehelper_disable(); - if (error) - goto Exit; - /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Enable_umh; + goto Exit; printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -661,8 +656,6 @@ int hibernate(void) Free_bitmaps: free_basic_memory_bitmaps(); - Enable_umh: - usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -777,15 +770,9 @@ static int software_resume(void) if (error) goto close_finish; - error = usermodehelper_disable(); - if (error) - goto close_finish; - error = create_basic_memory_bitmaps(); - if (error) { - usermodehelper_enable(); + if (error) goto close_finish; - } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); @@ -806,7 +793,6 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); - usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/process.c b/kernel/power/process.c index 0d2aeb22610..19db29f6755 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -16,6 +16,7 @@ #include <linux/freezer.h> #include <linux/delay.h> #include <linux/workqueue.h> +#include <linux/kmod.h> /* * Timeout for stopping processes @@ -122,6 +123,10 @@ int freeze_processes(void) { int error; + error = __usermodehelper_disable(UMH_FREEZING); + if (error) + return error; + if (!pm_freezing) atomic_inc(&system_freezing_cnt); @@ -130,6 +135,7 @@ int freeze_processes(void) error = try_to_freeze_tasks(true); if (!error) { printk("done."); + __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); } printk("\n"); @@ -187,6 +193,8 @@ void thaw_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); + usermodehelper_enable(); + schedule(); printk("done.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index d6d6dbd1ecc..6a031e68402 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -230,6 +230,21 @@ int pm_qos_request_active(struct pm_qos_request *req) EXPORT_SYMBOL_GPL(pm_qos_request_active); /** + * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout + * @work: work struct for the delayed work (timeout) + * + * This cancels the timeout request by falling back to the default at timeout. + */ +static void pm_qos_work_fn(struct work_struct *work) +{ + struct pm_qos_request *req = container_of(to_delayed_work(work), + struct pm_qos_request, + work); + + pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); +} + +/** * pm_qos_add_request - inserts new qos request into the list * @req: pointer to a preallocated handle * @pm_qos_class: identifies which list of qos request to use @@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req, return; } req->pm_qos_class = pm_qos_class; + INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, &req->node, PM_QOS_ADD_REQ, value); } @@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, @@ -287,6 +306,34 @@ void pm_qos_update_request(struct pm_qos_request *req, EXPORT_SYMBOL_GPL(pm_qos_update_request); /** + * pm_qos_update_request_timeout - modifies an existing qos request temporarily. + * @req : handle to list element holding a pm_qos request to use + * @new_value: defines the temporal qos request + * @timeout_us: the effective duration of this qos request in usecs. + * + * After timeout_us, this qos request is cancelled automatically. + */ +void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, + unsigned long timeout_us) +{ + if (!req) + return; + if (WARN(!pm_qos_request_active(req), + "%s called for unknown object.", __func__)) + return; + + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); + + schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); +} + +/** * pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * @@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 88e5c967370..396d262b8fd 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -12,7 +12,6 @@ #include <linux/delay.h> #include <linux/errno.h> #include <linux/init.h> -#include <linux/kmod.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/syscalls.h> @@ -102,17 +101,12 @@ static int suspend_prepare(void) if (error) goto Finish; - error = usermodehelper_disable(); - if (error) - goto Finish; - error = suspend_freeze_processes(); if (!error) return 0; suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -259,7 +253,6 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); - usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8742fd013a9..eef311a58a6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -51,6 +51,23 @@ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) +/* + * Number of free pages that are not high. + */ +static inline unsigned long low_free_pages(void) +{ + return nr_free_pages() - nr_free_highpages(); +} + +/* + * Number of pages required to be kept free while writing the image. Always + * half of all available low pages before the writing starts. + */ +static inline unsigned long reqd_free_pages(void) +{ + return low_free_pages() / 2; +} + struct swap_map_page { sector_t entries[MAP_PAGE_ENTRIES]; sector_t next_swap; @@ -72,7 +89,7 @@ struct swap_map_handle { sector_t cur_swap; sector_t first_sector; unsigned int k; - unsigned long nr_free_pages, written; + unsigned long reqd_free_pages; u32 crc32; }; @@ -316,8 +333,7 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; - handle->nr_free_pages = nr_free_pages() >> 1; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -352,11 +368,11 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, handle->cur_swap = offset; handle->k = 0; } - if (bio_chain && ++handle->written > handle->nr_free_pages) { + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; - handle->written = 0; + handle->reqd_free_pages = reqd_free_pages(); } out: return error; @@ -618,7 +634,7 @@ static int save_image_lzo(struct swap_map_handle *handle, * Adjust number of free pages after all allocations have been done. * We don't want to run out of pages when writing. */ - handle->nr_free_pages = nr_free_pages() >> 1; + handle->reqd_free_pages = reqd_free_pages(); /* * Start the CRC32 thread. diff --git a/kernel/power/user.c b/kernel/power/user.c index 33c4329205a..91b0fd021a9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -12,7 +12,6 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/reboot.h> -#include <linux/kmod.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> @@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, sys_sync(); printk("done.\n"); - error = usermodehelper_disable(); - if (error) - break; - error = freeze_processes(); - if (error) - usermodehelper_enable(); - else + if (!error) data->frozen = 1; break; @@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; pm_restore_gfp_mask(); thaw_processes(); - usermodehelper_enable(); data->frozen = 0; break; diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d3..32462d2b364 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -41,6 +41,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/rculist.h> +#include <linux/poll.h> #include <asm/uaccess.h> @@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) { } -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); static int console_locked, console_suspended; /* - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars - * It is also used in interesting ways to provide interlocking in - * console_unlock();. - */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); - -#define LOG_BUF_MASK (log_buf_len-1) -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) - -/* - * The indices into log_buf are not constrained to log_buf_len - they - * must be masked before subscripting - */ -static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ - -/* * If exclusive_console is non-NULL then only this console is to be printed to. */ static struct console *exclusive_console; @@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +/* + * The printk log buffer consists of a chain of concatenated variable + * length records. Every record starts with a record header, containing + * the overall length of the record. + * + * The heads to the first and last entry in the buffer, as well as the + * sequence numbers of these both entries are maintained when messages + * are stored.. + * + * If the heads indicate available messages, the length in the header + * tells the start next message. A length == 0 for the next message + * indicates a wrap-around to the beginning of the buffer. + * + * Every record carries the monotonic timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual + * kernel messages use LOG_KERN; userspace-injected messages always carry + * a matching syslog facility, by default LOG_USER. The origin of every + * message can be reliably determined that way. + * + * The human readable log message directly follows the message header. The + * length of the message text is stored in the header, the stored message + * is not terminated. + * + * Optionally, a message can carry a dictionary of properties (key/value pairs), + * to provide userspace with a machine-readable message context. + * + * Examples for well-defined, commonly used property names are: + * DEVICE=b12:8 device identifier + * b12:8 block dev_t + * c127:3 char dev_t + * n8 netdev ifindex + * +sound:card0 subsystem:devname + * SUBSYSTEM=pci driver-core subsystem name + * + * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value + * follows directly after a '=' character. Every property is terminated by + * a '\0' character. The last property is not terminated. + * + * Example of a message structure: + * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec + * 0008 34 00 record is 52 bytes long + * 000a 0b 00 text is 11 bytes long + * 000c 1f 00 dictionary is 23 bytes long + * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) + * 0010 69 74 27 73 20 61 20 6c "it's a l" + * 69 6e 65 "ine" + * 001b 44 45 56 49 43 "DEVIC" + * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" + * 52 49 56 45 52 3d 62 75 "RIVER=bu" + * 67 "g" + * 0032 00 00 00 padding to next message header + * + * The 'struct log' buffer header must never be directly exported to + * userspace, it is a kernel-private implementation detail that might + * need to be changed in the future, when the requirements change. + * + * /dev/kmsg exports the structured data in the following line format: + * "level,sequnum,timestamp;<message text>\n" + * + * The optional key/value pairs are attached as continuation lines starting + * with a space character and terminated by a newline. All possible + * non-prinatable characters are escaped in the "\xff" notation. + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. + */ + +struct log { + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 len; /* length of entire record */ + u16 text_len; /* length of text buffer */ + u16 dict_len; /* length of dictionary buffer */ + u16 level; /* syslog level + facility */ +}; + +/* + * The logbuf_lock protects kmsg buffer, indices, counters. It is also + * used in interesting ways to provide interlocking in console_unlock(); + */ +static DEFINE_RAW_SPINLOCK(logbuf_lock); + +/* the next printk record to read by syslog(READ) or /proc/kmsg */ +static u64 syslog_seq; +static u32 syslog_idx; + +/* index and sequence number of the first record stored in the buffer */ +static u64 log_first_seq; +static u32 log_first_idx; + +/* index and sequence number of the next record to store in the buffer */ +static u64 log_next_seq; #ifdef CONFIG_PRINTK +static u32 log_next_idx; + +/* the next printk record to read after the last 'clear' command */ +static u64 clear_seq; +static u32 clear_idx; + +#define LOG_LINE_MAX 1024 -static char __log_buf[__LOG_BUF_LEN]; +/* record buffer */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define LOG_ALIGN 4 +#else +#define LOG_ALIGN 8 +#endif +#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; -static int log_buf_len = __LOG_BUF_LEN; -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ -static int saved_console_loglevel = -1; +static u32 log_buf_len = __LOG_BUF_LEN; + +/* cpu currently holding logbuf_lock */ +static volatile unsigned int logbuf_cpu = UINT_MAX; + +/* human readable text of the record */ +static char *log_text(const struct log *msg) +{ + return (char *)msg + sizeof(struct log); +} + +/* optional key/value pair dictionary attached to the record */ +static char *log_dict(const struct log *msg) +{ + return (char *)msg + sizeof(struct log) + msg->text_len; +} + +/* get record by index; idx must point to valid msg */ +static struct log *log_from_idx(u32 idx) +{ + struct log *msg = (struct log *)(log_buf + idx); + + /* + * A length == 0 record is the end of buffer marker. Wrap around and + * read the message at the start of the buffer. + */ + if (!msg->len) + return (struct log *)log_buf; + return msg; +} + +/* get next record; idx must point to valid msg */ +static u32 log_next(u32 idx) +{ + struct log *msg = (struct log *)(log_buf + idx); + + /* length == 0 indicates the end of the buffer; wrap */ + /* + * A length == 0 record is the end of buffer marker. Wrap around and + * read the message at the start of the buffer as *this* one, and + * return the one after that. + */ + if (!msg->len) { + msg = (struct log *)log_buf; + return msg->len; + } + return idx + msg->len; +} + +/* insert record into the buffer, discard old ones, update heads */ +static void log_store(int facility, int level, + const char *dict, u16 dict_len, + const char *text, u16 text_len) +{ + struct log *msg; + u32 size, pad_len; + + /* number of '\0' padding bytes to next message */ + size = sizeof(struct log) + text_len + dict_len; + pad_len = (-size) & (LOG_ALIGN - 1); + size += pad_len; + + while (log_first_seq < log_next_seq) { + u32 free; + + if (log_next_idx > log_first_idx) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + if (free > size + sizeof(struct log)) + break; + + /* drop old messages until we have enough contiuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } + + if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { + /* + * This message + an additional empty header does not fit + * at the end of the buffer. Add an empty header with len == 0 + * to signify a wrap around. + */ + memset(log_buf + log_next_idx, 0, sizeof(struct log)); + log_next_idx = 0; + } + + /* fill message */ + msg = (struct log *)(log_buf + log_next_idx); + memcpy(log_text(msg), text, text_len); + msg->text_len = text_len; + memcpy(log_dict(msg), dict, dict_len); + msg->dict_len = dict_len; + msg->level = (facility << 3) | (level & 7); + msg->ts_nsec = local_clock(); + memset(log_dict(msg) + dict_len, 0, pad_len); + msg->len = sizeof(struct log) + text_len + dict_len + pad_len; + + /* insert message */ + log_next_idx += msg->len; + log_next_seq++; +} + +/* /dev/kmsg - userspace message inject/listen interface */ +struct devkmsg_user { + u64 seq; + u32 idx; + struct mutex lock; + char buf[8192]; +}; + +static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + char *buf, *line; + int i; + int level = default_message_loglevel; + int facility = 1; /* LOG_USER */ + size_t len = iov_length(iv, count); + ssize_t ret = len; + + if (len > LOG_LINE_MAX) + return -EINVAL; + buf = kmalloc(len+1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + line = buf; + for (i = 0; i < count; i++) { + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + goto out; + line += iv[i].iov_len; + } + + /* + * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace + * the decimal value represents 32bit, the lower 3 bit are the log + * level, the rest are the log facility. + * + * If no prefix or no userspace facility is specified, we + * enforce LOG_USER, to be able to reliably distinguish + * kernel-generated messages from userspace-injected ones. + */ + line = buf; + if (line[0] == '<') { + char *endp = NULL; + + i = simple_strtoul(line+1, &endp, 10); + if (endp && endp[0] == '>') { + level = i & 7; + if (i >> 3) + facility = i >> 3; + endp++; + len -= endp - line; + line = endp; + } + } + line[len] = '\0'; + + printk_emit(facility, level, NULL, 0, "%s", line); +out: + kfree(buf); + return ret; +} + +static ssize_t devkmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct devkmsg_user *user = file->private_data; + struct log *msg; + u64 ts_usec; + size_t i; + size_t len; + ssize_t ret; + + if (!user) + return -EBADF; + + mutex_lock(&user->lock); + raw_spin_lock(&logbuf_lock); + while (user->seq == log_next_seq) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + raw_spin_unlock(&logbuf_lock); + ret = wait_event_interruptible(log_wait, + user->seq != log_next_seq); + if (ret) + goto out; + raw_spin_lock(&logbuf_lock); + } + + if (user->seq < log_first_seq) { + /* our last seen message is gone, return error and reset */ + user->idx = log_first_idx; + user->seq = log_first_seq; + ret = -EPIPE; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + msg = log_from_idx(user->idx); + ts_usec = msg->ts_nsec; + do_div(ts_usec, 1000); + len = sprintf(user->buf, "%u,%llu,%llu;", + msg->level, user->seq, ts_usec); + + /* escape non-printable characters */ + for (i = 0; i < msg->text_len; i++) { + unsigned char c = log_text(msg)[i]; + + if (c < ' ' || c >= 128) + len += sprintf(user->buf + len, "\\x%02x", c); + else + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + + if (msg->dict_len) { + bool line = true; + + for (i = 0; i < msg->dict_len; i++) { + unsigned char c = log_dict(msg)[i]; + + if (line) { + user->buf[len++] = ' '; + line = false; + } + + if (c == '\0') { + user->buf[len++] = '\n'; + line = true; + continue; + } + + if (c < ' ' || c >= 128) { + len += sprintf(user->buf + len, "\\x%02x", c); + continue; + } + + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + } + + user->idx = log_next(user->idx); + user->seq++; + raw_spin_unlock(&logbuf_lock); + + if (len > count) { + ret = -EINVAL; + goto out; + } + + if (copy_to_user(buf, user->buf, len)) { + ret = -EFAULT; + goto out; + } + ret = len; +out: + mutex_unlock(&user->lock); + return ret; +} + +static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +{ + struct devkmsg_user *user = file->private_data; + loff_t ret = 0; + + if (!user) + return -EBADF; + if (offset) + return -ESPIPE; + + raw_spin_lock(&logbuf_lock); + switch (whence) { + case SEEK_SET: + /* the first record */ + user->idx = log_first_idx; + user->seq = log_first_seq; + break; + case SEEK_DATA: + /* + * The first record after the last SYSLOG_ACTION_CLEAR, + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ + user->idx = clear_idx; + user->seq = clear_seq; + break; + case SEEK_END: + /* after the last record */ + user->idx = log_next_idx; + user->seq = log_next_seq; + break; + default: + ret = -EINVAL; + } + raw_spin_unlock(&logbuf_lock); + return ret; +} + +static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +{ + struct devkmsg_user *user = file->private_data; + int ret = 0; + + if (!user) + return POLLERR|POLLNVAL; + + poll_wait(file, &log_wait, wait); + + raw_spin_lock(&logbuf_lock); + if (user->seq < log_next_seq) { + /* return error when data has vanished underneath us */ + if (user->seq < log_first_seq) + ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; + ret = POLLIN|POLLRDNORM; + } + raw_spin_unlock(&logbuf_lock); + + return ret; +} + +static int devkmsg_open(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user; + int err; + + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + return 0; + + err = security_syslog(SYSLOG_ACTION_READ_ALL); + if (err) + return err; + + user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); + if (!user) + return -ENOMEM; + + mutex_init(&user->lock); + + raw_spin_lock(&logbuf_lock); + user->idx = log_first_idx; + user->seq = log_first_seq; + raw_spin_unlock(&logbuf_lock); + + file->private_data = user; + return 0; +} + +static int devkmsg_release(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user = file->private_data; + + if (!user) + return 0; + + mutex_destroy(&user->lock); + kfree(user); + return 0; +} + +const struct file_operations kmsg_fops = { + .open = devkmsg_open, + .read = devkmsg_read, + .aio_write = devkmsg_writev, + .llseek = devkmsg_llseek, + .poll = devkmsg_poll, + .release = devkmsg_release, +}; #ifdef CONFIG_KEXEC /* @@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; void log_buf_kexec_setup(void) { VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_end); VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(logged_chars); + VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(log_next_idx); } #endif @@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); void __init setup_log_buf(int early) { unsigned long flags; - unsigned start, dest_idx, offset; char *new_log_buf; int free; @@ -219,20 +677,8 @@ void __init setup_log_buf(int early) log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_end; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); - - log_buf[dest_idx] = __log_buf[log_idx_mask]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; + free = __LOG_BUF_LEN - log_next_idx; + memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); @@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) return 0; } +#if defined(CONFIG_PRINTK_TIME) +static bool printk_time = 1; +#else +static bool printk_time; +#endif +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); + +static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +{ + size_t len = 0; + + if (syslog) { + if (buf) { + len += sprintf(buf, "<%u>", msg->level); + } else { + len += 3; + if (msg->level > 9) + len++; + if (msg->level > 99) + len++; + } + } + + if (printk_time) { + if (buf) { + unsigned long long ts = msg->ts_nsec; + unsigned long rem_nsec = do_div(ts, 1000000000); + + len += sprintf(buf + len, "[%5lu.%06lu] ", + (unsigned long) ts, rem_nsec / 1000); + } else { + len += 15; + } + } + + return len; +} + +static size_t msg_print_text(const struct log *msg, bool syslog, + char *buf, size_t size) +{ + const char *text = log_text(msg); + size_t text_size = msg->text_len; + size_t len = 0; + + do { + const char *next = memchr(text, '\n', text_size); + size_t text_len; + + if (next) { + text_len = next - text; + next++; + text_size -= next - text; + } else { + text_len = text_size; + } + + if (buf) { + if (print_prefix(msg, syslog, NULL) + + text_len + 1>= size - len) + break; + + len += print_prefix(msg, syslog, buf + len); + memcpy(buf + len, text, text_len); + len += text_len; + buf[len++] = '\n'; + } else { + /* SYSLOG_ACTION_* buffer size only calculation */ + len += print_prefix(msg, syslog, NULL); + len += text_len + 1; + } + + text = next; + } while (text); + + return len; +} + +static int syslog_print(char __user *buf, int size) +{ + char *text; + struct log *msg; + int len; + + text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + msg = log_from_idx(syslog_idx); + len = msg_print_text(msg, true, text, LOG_LINE_MAX); + syslog_idx = log_next(syslog_idx); + syslog_seq++; + raw_spin_unlock_irq(&logbuf_lock); + + if (len > 0 && copy_to_user(buf, text, len)) + len = -EFAULT; + + kfree(text); + return len; +} + +static int syslog_print_all(char __user *buf, int size, bool clear) +{ + char *text; + int len = 0; + + text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + + raw_spin_lock_irq(&logbuf_lock); + if (buf) { + u64 next_seq; + u64 seq; + u32 idx; + + if (clear_seq < log_first_seq) { + /* messages are gone, move to first available one */ + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ + seq = clear_seq; + idx = clear_idx; + while (seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + len += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + seq = clear_seq; + idx = clear_idx; + while (len > size && seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + len -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* last message in this dump */ + next_seq = log_next_seq; + + len = 0; + while (len >= 0 && seq < next_seq) { + struct log *msg = log_from_idx(idx); + int textlen; + + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + if (textlen < 0) { + len = textlen; + break; + } + idx = log_next(idx); + seq++; + + raw_spin_unlock_irq(&logbuf_lock); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; + raw_spin_lock_irq(&logbuf_lock); + + if (seq < log_first_seq) { + /* messages are gone, move to next one */ + seq = log_first_seq; + idx = log_first_idx; + } + } + } + + if (clear) { + clear_seq = log_next_seq; + clear_idx = log_next_idx; + } + raw_spin_unlock_irq(&logbuf_lock); + + kfree(text); + return len; +} + int do_syslog(int type, char __user *buf, int len, bool from_file) { - unsigned i, j, limit, count; - int do_clear = 0; - char c; + bool clear = false; + static int saved_console_loglevel = -1; int error; error = check_syslog_permissions(type, from_file); @@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) goto out; } error = wait_event_interruptible(log_wait, - (log_start - log_end)); + syslog_seq != log_next_seq); if (error) goto out; - i = 0; - raw_spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,buf); - buf++; - i++; - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (!error) - error = i; + error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: - do_clear = 1; + clear = true; /* FALL THRU */ /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: @@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - count = len; - if (count > log_buf_len) - count = log_buf_len; - raw_spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; - if (do_clear) - logged_chars = 0; - limit = log_end; - /* - * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages - * we try to copy to user space. Therefore - * the messages are copied in reverse. <manfreds> - */ - for (i = 0; i < count && !error; i++) { - j = limit-1-i; - if (j + log_buf_len < log_end) - break; - c = LOG_BUF(j); - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,&buf[count-1-i]); - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (error) - break; - error = i; - if (i != count) { - int offset = count-error; - /* buffer overflow during copy, correct user buffer. */ - for (i = 0; i < error; i++) { - if (__get_user(c,&buf[i+offset]) || - __put_user(c,&buf[i])) { - error = -EFAULT; - break; - } - cond_resched(); - } - } + error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: - logged_chars = 0; - break; + syslog_print_all(NULL, 0, true); /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) @@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - error = log_end - log_start; + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + if (from_file) { + /* + * Short-cut for poll(/"proc/kmsg") which simply checks + * for pending data, not the size; return the count of + * records, not the length. + */ + error = log_next_idx - syslog_idx; + } else { + u64 seq; + u32 idx; + + error = 0; + seq = syslog_seq; + idx = syslog_idx; + while (seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + error += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + } + raw_spin_unlock_irq(&logbuf_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) { syslog_data[0] = log_buf; syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_end - - (logged_chars < log_buf_len ? logged_chars : log_buf_len); - syslog_data[3] = log_buf + log_end; + syslog_data[2] = log_buf + log_first_idx; + syslog_data[3] = log_buf + log_next_idx; } #endif /* CONFIG_KGDB_KDB */ -/* - * Call the console drivers on a range of log_buf - */ -static void __call_console_drivers(unsigned start, unsigned end) -{ - struct console *con; - - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) - con->write(con, &LOG_BUF(start), end - start); - } -} - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) @@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); /* - * Write out chars from start to end - 1 inclusive - */ -static void _call_console_drivers(unsigned start, - unsigned end, int msg_log_level) -{ - trace_console(&LOG_BUF(0), start, end, log_buf_len); - - if ((msg_log_level < console_loglevel || ignore_loglevel) && - console_drivers && start != end) { - if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { - /* wrapped write */ - __call_console_drivers(start & LOG_BUF_MASK, - log_buf_len); - __call_console_drivers(0, end & LOG_BUF_MASK); - } else { - __call_console_drivers(start, end); - } - } -} - -/* - * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the - * lower 3 bit are the log level, the rest are the log facility. In case - * userspace passes usual userspace syslog messages to /dev/kmsg or - * /dev/ttyprintk, the log prefix might contain the facility. Printk needs - * to extract the correct log level for in-kernel processing, and not mangle - * the original value. - * - * If a prefix is found, the length of the prefix is returned. If 'level' is - * passed, it will be filled in with the log level without a possible facility - * value. If 'special' is passed, the special printk prefix chars are accepted - * and returned. If no valid header is found, 0 is returned and the passed - * variables are not touched. - */ -static size_t log_prefix(const char *p, unsigned int *level, char *special) -{ - unsigned int lev = 0; - char sp = '\0'; - size_t len; - - if (p[0] != '<' || !p[1]) - return 0; - if (p[2] == '>') { - /* usual single digit level number or special char */ - switch (p[1]) { - case '0' ... '7': - lev = p[1] - '0'; - break; - case 'c': /* KERN_CONT */ - case 'd': /* KERN_DEFAULT */ - sp = p[1]; - break; - default: - return 0; - } - len = 3; - } else { - /* multi digit including the level and facility number */ - char *endp = NULL; - - lev = (simple_strtoul(&p[1], &endp, 10) & 7); - if (endp == NULL || endp[0] != '>') - return 0; - len = (endp + 1) - p; - } - - /* do not accept special char if not asked for */ - if (sp && !special) - return 0; - - if (special) { - *special = sp; - /* return special char, do not touch level */ - if (sp) - return len; - } - - if (level) - *level = lev; - return len; -} - -/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(unsigned start, unsigned end) +static void call_console_drivers(int level, const char *text, size_t len) { - unsigned cur_index, start_print; - static int msg_level = -1; + struct console *con; - BUG_ON(((int)(start - end)) > 0); + trace_console(text, 0, len, len); - cur_index = start; - start_print = start; - while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2)) { - /* strip log prefix */ - cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); - start_print = cur_index; - } - while (cur_index != end) { - char c = LOG_BUF(cur_index); - - cur_index++; - if (c == '\n') { - if (msg_level < 0) { - /* - * printk() has already given us loglevel tags in - * the buffer. This code is here in case the - * log buffer has wrapped right round and scribbled - * on those tags - */ - msg_level = default_message_loglevel; - } - _call_console_drivers(start_print, cur_index, msg_level); - msg_level = -1; - start_print = cur_index; - break; - } - } - } - _call_console_drivers(start_print, end, msg_level); -} + if (level >= console_loglevel && !ignore_loglevel) + return; + if (!console_drivers) + return; -static void emit_log_char(char c) -{ - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; + if (!(con->flags & CON_ENABLED)) + continue; + if (!con->write) + continue; + if (!cpu_online(smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; + con->write(con, text, len); + } } /* @@ -700,16 +1183,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time = 0; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -static bool always_kmsg_dump; -module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); - /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -722,51 +1195,6 @@ static int have_callable_console(void) return 0; } -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk(). It can be called from any context. We want it to work. - * - * We try to grab the console_lock. If we succeed, it's easy - we log the output and - * call the console drivers. If we fail to get the semaphore we place the output - * into the log buffer and return. The current holder of the console_sem will - * notice the new output in console_unlock(); and will send it to the - * consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ - -asmlinkage int printk(const char *fmt, ...) -{ - va_list args; - int r; - -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif - va_start(args, fmt); - r = vprintk(fmt, args); - va_end(args); - - return r; -} - -/* cpu currently holding logbuf_lock */ -static volatile unsigned int printk_cpu = UINT_MAX; - /* * Can we actually use the console at this time on this cpu? * @@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) retval = 0; } } - printk_cpu = UINT_MAX; + logbuf_cpu = UINT_MAX; if (wake) up(&console_sem); raw_spin_unlock(&logbuf_lock); return retval; } -static const char recursion_bug_msg [] = - KERN_CRIT "BUG: recent printk recursion!\n"; -static int recursion_bug; -static int new_text_line = 1; -static char printk_buf[1024]; int printk_delay_msec __read_mostly; @@ -836,15 +1259,23 @@ static inline void printk_delay(void) } } -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) { - int printed_len = 0; - int current_log_level = default_message_loglevel; + static int recursion_bug; + static char cont_buf[LOG_LINE_MAX]; + static size_t cont_len; + static int cont_level; + static struct task_struct *cont_task; + static char textbuf[LOG_LINE_MAX]; + char *text = textbuf; + size_t text_len; unsigned long flags; int this_cpu; - char *p; - size_t plen; - char special; + bool newline = false; + bool prefix = false; + int printed_len = 0; boot_delay_msec(); printk_delay(); @@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* * Ouch, printk recursed into itself! */ - if (unlikely(printk_cpu == this_cpu)) { + if (unlikely(logbuf_cpu == this_cpu)) { /* * If a crash is occurring during printk() on this CPU, * then try to get the crash message out but make sure @@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) lockdep_off(); raw_spin_lock(&logbuf_lock); - printk_cpu = this_cpu; + logbuf_cpu = this_cpu; if (recursion_bug) { + static const char recursion_msg[] = + "BUG: recent printk recursion!"; + recursion_bug = 0; - strcpy(printk_buf, recursion_bug_msg); - printed_len = strlen(recursion_bug_msg); + printed_len += strlen(recursion_msg); + /* emit KERN_CRIT message */ + log_store(0, 2, NULL, 0, recursion_msg, printed_len); } - /* Emit the output into the temporary buffer */ - printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; + /* + * The printf needs to come first; we need the syslog + * prefix which might be passed-in as a parameter. + */ + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); - /* Read log level and handle special printk prefix */ - plen = log_prefix(p, ¤t_log_level, &special); - if (plen) { - p += plen; + /* mark and strip a trailing newline */ + if (text_len && text[text_len-1] == '\n') { + text_len--; + newline = true; + } - switch (special) { - case 'c': /* Strip <c> KERN_CONT, continue line */ - plen = 0; - break; - case 'd': /* Strip <d> KERN_DEFAULT, start new line */ - plen = 0; - default: - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } + /* strip syslog prefix and extract log level or control flags */ + if (text[0] == '<' && text[1] && text[2] == '>') { + switch (text[1]) { + case '0' ... '7': + if (level == -1) + level = text[1] - '0'; + case 'd': /* KERN_DEFAULT */ + prefix = true; + case 'c': /* KERN_CONT */ + text += 3; + text_len -= 3; } } - /* - * Copy the output into log_buf. If the caller didn't provide - * the appropriate log prefix, we insert them here - */ - for (; *p; p++) { - if (new_text_line) { - new_text_line = 0; - - if (plen) { - /* Copy original log prefix */ - int i; - - for (i = 0; i < plen; i++) - emit_log_char(printk_buf[i]); - printed_len += plen; - } else { - /* Add log prefix */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; - } + if (level == -1) + level = default_message_loglevel; - if (printk_time) { - /* Add the current time stamp */ - char tbuf[50], *tp; - unsigned tlen; - unsigned long long t; - unsigned long nanosec_rem; - - t = cpu_clock(printk_cpu); - nanosec_rem = do_div(t, 1000000000); - tlen = sprintf(tbuf, "[%5lu.%06lu] ", - (unsigned long) t, - nanosec_rem / 1000); - - for (tp = tbuf; tp < tbuf + tlen; tp++) - emit_log_char(*tp); - printed_len += tlen; - } + if (dict) { + prefix = true; + newline = true; + } - if (!*p) - break; + if (!newline) { + if (cont_len && (prefix || cont_task != current)) { + /* + * Flush earlier buffer, which is either from a + * different thread, or when we got a new prefix. + */ + log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); + cont_len = 0; } - emit_log_char(*p); - if (*p == '\n') - new_text_line = 1; + if (!cont_len) { + cont_level = level; + cont_task = current; + } + + /* buffer or append to earlier buffer from the same thread */ + if (cont_len + text_len > sizeof(cont_buf)) + text_len = sizeof(cont_buf) - cont_len; + memcpy(cont_buf + cont_len, text, text_len); + cont_len += text_len; + } else { + if (cont_len && cont_task == current) { + if (prefix) { + /* + * New prefix from the same thread; flush. We + * either got no earlier newline, or we race + * with an interrupt. + */ + log_store(facility, cont_level, + NULL, 0, cont_buf, cont_len); + cont_len = 0; + } + + /* append to the earlier buffer and flush */ + if (cont_len + text_len > sizeof(cont_buf)) + text_len = sizeof(cont_buf) - cont_len; + memcpy(cont_buf + cont_len, text, text_len); + cont_len += text_len; + log_store(facility, cont_level, + NULL, 0, cont_buf, cont_len); + cont_len = 0; + cont_task = NULL; + printed_len = cont_len; + } else { + /* ordinary single and terminated line */ + log_store(facility, level, + dict, dictlen, text, text_len); + printed_len = text_len; + } } /* - * Try to acquire and then immediately release the - * console semaphore. The release will do all the - * actual magic (print out buffers, wake up klogd, - * etc). + * Try to acquire and then immediately release the console semaphore. + * The release will print out buffers and wake up /dev/kmsg and syslog() + * users. * - * The console_trylock_for_printk() function - * will release 'logbuf_lock' regardless of whether it - * actually gets the semaphore or not. + * The console_trylock_for_printk() function will release 'logbuf_lock' + * regardless of whether it actually gets the console semaphore or not. */ if (console_trylock_for_printk(this_cpu)) console_unlock(); @@ -974,16 +1418,81 @@ out_restore_irqs: return printed_len; } -EXPORT_SYMBOL(printk); -EXPORT_SYMBOL(vprintk); +EXPORT_SYMBOL(vprintk_emit); -#else +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_emit(0, -1, NULL, 0, fmt, args); +} +EXPORT_SYMBOL(vprintk); -static void call_console_drivers(unsigned start, unsigned end) +asmlinkage int printk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, ...) { + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_emit(facility, level, dict, dictlen, fmt, args); + va_end(args); + + return r; } +EXPORT_SYMBOL(printk_emit); +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * We try to grab the console_lock. If we succeed, it's easy - we log the + * output and call the console drivers. If we fail to get the semaphore, we + * place the output into the log buffer and return. The current holder of + * the console_sem will notice the new output in console_unlock(); and will + * send it to the consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ +asmlinkage int printk(const char *fmt, ...) +{ + va_list args; + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } #endif + va_start(args, fmt); + r = vprintk_emit(0, -1, NULL, 0, fmt, args); + va_end(args); + + return r; +} +EXPORT_SYMBOL(printk); + +#else + +#define LOG_LINE_MAX 0 +static struct log *log_from_idx(u32 idx) { return NULL; } +static u32 log_next(u32 idx) { return 0; } +static void call_console_drivers(int level, const char *text, size_t len) {} +static size_t msg_print_text(const struct log *msg, bool syslog, + char *buf, size_t size) { return 0; } + +#endif /* CONFIG_PRINTK */ static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) @@ -1217,7 +1726,7 @@ int is_console_locked(void) } /* - * Delayed printk facility, for scheduler-internal messages: + * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_BUF_SIZE 512 @@ -1253,6 +1762,10 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; + /** * console_unlock - unlock the console system * @@ -1263,15 +1776,16 @@ void wake_up_klogd(void) * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting for klogd, we wake it up. + * If there is output waiting, we wake /dev/kmsg and syslog() users. * * console_unlock(); may be called from any context. */ void console_unlock(void) { + static u64 seen_seq; unsigned long flags; - unsigned _con_start, _log_end; - unsigned wake_klogd = 0, retry = 0; + bool wake_klogd = false; + bool retry; if (console_suspended) { up(&console_sem); @@ -1281,17 +1795,38 @@ void console_unlock(void) console_may_schedule = 0; again: - for ( ; ; ) { + for (;;) { + struct log *msg; + static char text[LOG_LINE_MAX]; + size_t len; + int level; + raw_spin_lock_irqsave(&logbuf_lock, flags); - wake_klogd |= log_start - log_end; - if (con_start == log_end) - break; /* Nothing to print */ - _con_start = con_start; - _log_end = log_end; - con_start = log_end; /* Flush */ + if (seen_seq != log_next_seq) { + wake_klogd = true; + seen_seq = log_next_seq; + } + + if (console_seq < log_first_seq) { + /* messages are gone, move to first one */ + console_seq = log_first_seq; + console_idx = log_first_idx; + } + + if (console_seq == log_next_seq) + break; + + msg = log_from_idx(console_idx); + level = msg->level & 7; + + len = msg_print_text(msg, false, text, sizeof(text)); + + console_idx = log_next(console_idx); + console_seq++; raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(_con_start, _log_end); + call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); } @@ -1312,8 +1847,7 @@ again: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - if (con_start != log_end) - retry = 1; + retry = console_seq != log_next_seq; raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (retry && console_trylock()) @@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) * for us. */ raw_spin_lock_irqsave(&logbuf_lock, flags); - con_start = log_start; + console_seq = syslog_seq; + console_idx = syslog_idx; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); */ void kmsg_dump(enum kmsg_dump_reason reason) { - unsigned long end; - unsigned chars; + u64 idx; struct kmsg_dumper *dumper; const char *s1, *s2; unsigned long l1, l2; @@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ + raw_spin_lock_irqsave(&logbuf_lock, flags); - end = log_end & LOG_BUF_MASK; - chars = logged_chars; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (syslog_seq < log_first_seq) + idx = syslog_idx; + else + idx = log_first_idx; - if (chars > end) { - s1 = log_buf + log_buf_len - chars + end; - l1 = chars - end; + if (idx > log_next_idx) { + s1 = log_buf; + l1 = log_next_idx; - s2 = log_buf; - l2 = end; + s2 = log_buf + idx; + l2 = log_buf_len - idx; } else { s1 = ""; l1 = 0; - s2 = log_buf + end - chars; - l2 = chars; + s2 = log_buf + idx; + l2 = log_next_idx - idx; } + raw_spin_unlock_irqrestore(&logbuf_lock, flags); rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc2..95cba41ce1e 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -51,6 +51,34 @@ #include "rcu.h" +#ifdef CONFIG_PREEMPT_RCU + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb6..fc31a2d6510 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; } -/* - * Check for a task exiting while in a preemptible -RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6..e66b34ab755 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ @@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); module_param(onoff_holdoff, int, 0444); @@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; static struct task_struct *onoff_task; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; @@ -173,6 +179,8 @@ static long n_offline_attempts; static long n_offline_successes; static long n_online_attempts; static long n_online_successes; +static long n_barrier_attempts; +static long n_barrier_successes; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ @@ -327,6 +339,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); int (*stats)(char *page); @@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, + .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_force_quiescent_state, .stats = NULL, @@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_bh_expedited, + .call = NULL, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -606,6 +625,11 @@ static int srcu_torture_completed(void) return srcu_batches_completed(&srcu_ctl); } +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + static void srcu_torture_synchronize(void) { synchronize_srcu(&srcu_ctl); @@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } @@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" }; +static struct rcu_torture_ops srcu_sync_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .call = NULL, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_sync" +}; + static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) { return srcu_read_lock_raw(&srcu_ctl); @@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock_raw, .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_raw" }; +static struct rcu_torture_ops srcu_raw_sync_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock_raw, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock_raw, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .call = NULL, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_raw_sync" +}; + static void srcu_torture_synchronize_expedited(void) { synchronize_srcu_expedited(&srcu_ctl); @@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { .completed = srcu_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize_expedited, + .call = NULL, .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu_expedited" @@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld " - "onoff: %ld/%ld:%ld/%ld", + "onoff: %ld/%ld:%ld/%ld " + "barrier: %ld/%ld:%ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) n_online_successes, n_online_attempts, n_offline_successes, - n_offline_attempts); + n_offline_attempts, + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0) - cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (i > 1) { + n_rcu_torture_boost_failure != 0 || + i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); @@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! */ kthread_stop(t); + boost_tasks[cpu] = NULL; } static int rcutorture_booster_init(int cpu) @@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); kthread_stop(onoff_task); + onoff_task = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void +static int rcu_torture_onoff_init(void) { + return 0; } static void rcu_torture_onoff_cleanup(void) @@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) return; VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); kthread_stop(stall_task); + stall_task = NULL; +} + +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + atomic_read(&barrier_cbs_count) == n_barrier_cbs || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + /* wake_up() path contains the required barriers. */ + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + printk(KERN_ALERT "%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + printk(KERN_ALERT "%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)(long)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } } static int rcutorture_cpu_notify(struct notifier_block *self, @@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); @@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } + shutdown_task = NULL; rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); - if (atomic_read(&n_rcu_torture_error)) + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (n_online_successes != n_online_attempts || n_offline_successes != n_offline_attempts) @@ -1692,10 +1904,12 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; + int retval; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, + &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, + &srcu_raw_sync_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1749,6 +1963,7 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; @@ -1872,7 +2087,6 @@ rcu_torture_init(void) test_boost_duration = 2; if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { - int retval; boost_starttime = jiffies + test_boost_interval * HZ; register_cpu_notifier(&rcutorture_cpu_nb); @@ -1897,9 +2111,22 @@ rcu_torture_init(void) goto unwind; } } - rcu_torture_onoff_init(); + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } register_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_stall_init(); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922..0da7b88d92d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ + .orphan_nxttail = &structname##_state.orphan_nxtlist, \ + .orphan_donetail = &structname##_state.orphan_donelist, \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; +/* State information for rcu_barrier() and friends. */ + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* - * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Also record a quiescent state for this CPU for the current grace period. - * Synchronization and interrupt disabling are not required because - * this function executes in stop_machine() context. Therefore, cleanup - * operations that might block must be done later from the CPU_DEAD - * notifier. - * - * Note that the outgoing CPU's bit has already been cleared in the - * cpu_online_mask. This allows us to randomly pick a callback - * destination from the bits set in that mask. + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->onofflock. */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) { int i; - unsigned long mask; - int receive_cpu = cpumask_any(cpu_online_mask); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ - /* First, adjust the counts. */ + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because ->onofflock excludes _rcu_barrier()'s adoption of + * the callbacks, thus no memory barrier is required. + */ if (rdp->nxtlist != NULL) { - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; + rsp->qlen_lazy += rdp->qlen_lazy; + rsp->qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen_lazy = 0; rdp->qlen = 0; } /* - * Next, move ready-to-invoke callbacks to be invoked on some - * other CPU. These will not be required to pass through another - * grace period: They are done, regardless of CPU. + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + * We don't bother updating the ->nxttail[] array yet, instead + * we just reset the whole thing later on. */ - if (rdp->nxtlist != NULL && - rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { - struct rcu_head *oldhead; - struct rcu_head **oldtail; - struct rcu_head **newtail; - - oldhead = rdp->nxtlist; - oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; - *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; - newtail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { - if (receive_rdp->nxttail[i] == oldtail) - receive_rdp->nxttail[i] = newtail; - if (rdp->nxttail[i] == newtail) - rdp->nxttail[i] = &rdp->nxtlist; - } + if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { + *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; + rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; } /* - * Finally, put the rest of the callbacks at the end of the list. - * The ones that made it partway through get to start over: We - * cannot assume that grace periods are synchronized across CPUs. - * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but - * this does not seem compelling. Not yet, anyway.) + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. */ if (rdp->nxtlist != NULL) { - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = - rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; + *rsp->orphan_donetail = rdp->nxtlist; + rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } + /* Finally, initialize the rcu_data structure's list to empty. */ + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->onofflock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + /* - * Record a quiescent state for the dying CPU. This is safe - * only because we have already cleared out the callbacks. - * (Otherwise, the RCU core might try to schedule the invocation - * of callbacks on this now-offline CPU, which would be bad.) + * If there is an rcu_barrier() operation in progress, then + * only the task doing that operation is permitted to adopt + * callbacks. To do otherwise breaks rcu_barrier() and friends + * by causing them to fail to wait for the callbacks in the + * orphanage. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ + if (rsp->rcu_barrier_in_progress && + rsp->rcu_barrier_in_progress != current) + return; + + /* Do the accounting first. */ + rdp->qlen_lazy += rsp->qlen_lazy; + rdp->qlen += rsp->qlen; + rdp->n_cbs_adopted += rsp->qlen; + rsp->qlen_lazy = 0; + rsp->qlen = 0; + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks. */ + if (rsp->orphan_donelist != NULL) { + *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; + for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = rsp->orphan_donetail; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + + /* And then adopt the callbacks that still need a grace period. */ + if (rsp->orphan_nxtlist != NULL) { + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } +} + +/* + * Trace the fact that this CPU is going offline. + */ +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +{ + RCU_TRACE(unsigned long mask); + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + + RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), "cpuofl"); - rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); - /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ } /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. + * this fact from process context. Do the remainder of the cleanup, + * including orphaning the outgoing CPU's RCU callbacks, and also + * adopting them, if there is no _rcu_barrier() instance running. * There can only be one CPU hotplug operation at a time, so no other * CPU can be attempting to update rcu_cpu_kthread_task. */ @@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) unsigned long mask; int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); rcu_node_kthread_setaffinity(rnp, -1); - /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ + /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); + /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp); + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { @@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ - rdp->qlen_lazy -= count_lazy; - rdp->qlen -= count; - rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) else break; } + smp_mb(); /* List handling before counting for rcu_barrier(). */ + rdp->qlen_lazy -= count_lazy; + rdp->qlen -= count; + rdp->n_cbs_invoked += count; /* Reinstate batch limit if we have worked down the excess. */ if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) @@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; if (lazy) rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); + smp_mb(); /* Count before adding callback for rcu_barrier(). */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, @@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Because a context switch is a grace period for RCU-sched and RCU-bh, + * any blocking grace-period wait automatically implies a grace period + * if there is only one CPU online at any point time during execution + * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds + * some overhead: RCU still operates correctly. + * + * Of course, sampling num_online_cpus() with preemption enabled can + * give erroneous results if there are concurrent CPU-hotplug operations. + * For example, given a demonic sequence of preemptions in num_online_cpus() + * and CPU-hotplug operations, there could be two or more CPUs online at + * all times, but num_online_cpus() might well return one (or even zero). + * + * However, all such demonic sequences require at least one CPU-offline + * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer + * is only a problem if there is an RCU read-side critical section executing + * throughout. But RCU-sched and RCU-bh read-side critical sections + * disable either preemption or bh, which prevents a CPU from going offline. + * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return + * that there is only one CPU when in fact there was more than one throughout + * is when there were no RCU readers in the system. If there are no + * RCU readers, the grace period by definition can be of zero length, + * regardless of the number of online CPUs. + */ +static inline int rcu_blocking_is_gp(void) +{ + might_sleep(); /* Check for RCU read-side critical section. */ + return num_online_cpus() <= 1; +} + /** * synchronize_sched - wait until an rcu-sched grace period has elapsed. * @@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu) rcu_preempt_cpu_has_callbacks(cpu); } -static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -static atomic_t rcu_barrier_cpu_count; -static DEFINE_MUTEX(rcu_barrier_mutex); -static struct completion rcu_barrier_completion; - +/* + * RCU callback function for _rcu_barrier(). If we are last, wake + * up the task executing _rcu_barrier(). + */ static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp, void (*call_rcu_func)(struct rcu_head *head, void (*func)(struct rcu_head *head))) { - BUG_ON(in_interrupt()); + int cpu; + unsigned long flags; + struct rcu_data *rdp; + struct rcu_head rh; + + init_rcu_head_on_stack(&rh); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rcu_barrier_mutex); - init_completion(&rcu_barrier_completion); + + smp_mb(); /* Prevent any prior operations from leaking in. */ + /* - * Initialize rcu_barrier_cpu_count to 1, then invoke - * rcu_barrier_func() on each CPU, so that each CPU also has - * incremented rcu_barrier_cpu_count. Only then is it safe to - * decrement rcu_barrier_cpu_count -- otherwise the first CPU - * might complete its grace period before all of the other CPUs - * did their increment, causing this function to return too - * early. Note that on_each_cpu() disables irqs, which prevents - * any CPUs from coming online or going offline until each online - * CPU has queued its RCU-barrier callback. + * Initialize the count to one rather than to zero in order to + * avoid a too-soon return to zero in case of a short grace period + * (or preemption of this task). Also flag this task as doing + * an rcu_barrier(). This will prevent anyone else from adopting + * orphaned callbacks, which could cause otherwise failure if a + * CPU went offline and quickly came back online. To see this, + * consider the following sequence of events: + * + * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. + * 2. CPU 1 goes offline, orphaning its callbacks. + * 3. CPU 0 adopts CPU 1's orphaned callbacks. + * 4. CPU 1 comes back online. + * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. + * 6. Both rcu_barrier_callback() callbacks are invoked, awakening + * us -- but before CPU 1's orphaned callbacks are invoked!!! */ + init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 1); - on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + raw_spin_lock_irqsave(&rsp->onofflock, flags); + rsp->rcu_barrier_in_progress = current; + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* + * Force every CPU with callbacks to register a new callback + * that will tell us when all the preceding callbacks have + * been invoked. If an offline CPU has callbacks, wait for + * it to either come back online or to finish orphaning those + * callbacks. + */ + for_each_possible_cpu(cpu) { + preempt_disable(); + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_is_offline(cpu)) { + preempt_enable(); + while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) + schedule_timeout_interruptible(1); + } else if (ACCESS_ONCE(rdp->qlen)) { + smp_call_function_single(cpu, rcu_barrier_func, + (void *)call_rcu_func, 1); + preempt_enable(); + } else { + preempt_enable(); + } + } + + /* + * Now that all online CPUs have rcu_barrier_callback() callbacks + * posted, we can adopt all of the orphaned callbacks and place + * an rcu_barrier_callback() callback after them. When that is done, + * we are guaranteed to have an rcu_barrier_callback() callback + * following every callback that could possibly have been + * registered before _rcu_barrier() was called. + */ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + rcu_adopt_orphan_cbs(rsp); + rsp->rcu_barrier_in_progress = NULL; + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + atomic_inc(&rcu_barrier_cpu_count); + smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ + call_rcu_func(&rh, rcu_barrier_callback); + + /* + * Now that we have an rcu_barrier_callback() callback on each + * CPU, and thus each counted, remove the initial count. + */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); + + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rcu_barrier_completion); + + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rcu_barrier_mutex); + + destroy_rcu_head_on_stack(&rh); } /** @@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = RCU_FANOUT_LEAF; + rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a407..7f5d138dedf 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -29,18 +29,14 @@ #include <linux/seqlock.h> /* - * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. * In theory, it should be possible to add more levels straightforwardly. * In practice, this did work well going from three levels to four. * Of course, your mileage may vary. */ #define MAX_RCU_LVLS 4 -#if CONFIG_RCU_FANOUT > 16 -#define RCU_FANOUT_LEAF 16 -#else /* #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) -#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) @@ -371,6 +367,17 @@ struct rcu_state { raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. */ + struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + /* need a grace period. */ + struct rcu_head **orphan_nxttail; /* Tail of above. */ + struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + /* are ready to invoke. */ + struct rcu_head **orphan_donetail; /* Tail of above. */ + long qlen_lazy; /* Number of lazy callbacks. */ + long qlen; /* Total number of callbacks. */ + struct task_struct *rcu_barrier_in_progress; + /* Task doing rcu_barrier(), */ + /* or NULL if no barrier. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, @@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816b..2411000d986 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -static void rcu_preempt_note_context_switch(int cpu) +void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rdp = __this_cpu_ptr(rcu_preempt_state.rda); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(cpu); + rcu_preempt_qs(smp_processor_id()); local_irq_restore(flags); } @@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) - return; - t->rcu_read_lock_nesting = 1; - __rcu_read_unlock(); -} - #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ -} - -/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) { } +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} + #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +/* Loop counter for rcu_prepare_for_idle(). */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); +/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ -static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ +/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ +static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); +/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ +static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); +/* Enable special processing on first attempt to enter dyntick-idle mode. */ +static DEFINE_PER_CPU(bool, rcu_idle_first_pass); +/* Running count of non-lazy callbacks posted, never decremented. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); +/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ +static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ */ int rcu_needs_cpu(int cpu) { + /* Flag a new idle sojourn to the idle-entry state machine. */ + per_cpu(rcu_idle_first_pass, cpu) = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; @@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) } /* + * Handler for smp_call_function_single(). The only point of this + * handler is to wake the CPU up, so the handler does only tracing. + */ +void rcu_idle_demigrate(void *unused) +{ + trace_rcu_prep_idle("Demigrate"); +} + +/* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks * pending. The hander doesn't really need to do anything because the * real work is done upon re-entry to idle, or by the next scheduling-clock * interrupt should idle not be re-entered. + * + * One special case: the timer gets migrated without awakening the CPU + * on which the timer was scheduled on. In this case, we must wake up + * that CPU. We do so with smp_call_function_single(). */ -static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) +static void rcu_idle_gp_timer_func(unsigned long cpu_in) { + int cpu = (int)cpu_in; + trace_rcu_prep_idle("Timer"); - return HRTIMER_NORESTART; + if (cpu != smp_processor_id()) + smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); + else + WARN_ON_ONCE(1); /* Getting here can hang the system... */ } /* @@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) */ static void rcu_prepare_for_idle_init(int cpu) { - static int firsttime = 1; - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); - - hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtp->function = rcu_idle_gp_timer_func; - if (firsttime) { - unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); - - rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); - upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); - rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); - firsttime = 0; - } + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_timer_func, cpu); + per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; + per_cpu(rcu_idle_first_pass, cpu) = 1; } /* @@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); + del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + trace_rcu_prep_idle("Cleanup after idle"); } /* @@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { + struct timer_list *tp; + + /* + * If this is an idle re-entry, for example, due to use of + * RCU_NONIDLE() or the new idle-loop tracing API within the idle + * loop, then don't take any state-machine actions, unless the + * momentary exit from idle queued additional non-lazy callbacks. + * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * pending. + */ + if (!per_cpu(rcu_idle_first_pass, cpu) && + (per_cpu(rcu_nonlazy_posted, cpu) == + per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (rcu_cpu_has_callbacks(cpu)) { + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + } + return; + } + per_cpu(rcu_idle_first_pass, cpu) = 0; + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu) - 1; + /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_GP_DELAY; else - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); + per_cpu(rcu_idle_gp_timer_expires, cpu) = + jiffies + RCU_IDLE_LAZY_GP_DELAY; + tp = &per_cpu(rcu_idle_gp_timer, cpu); + mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + per_cpu(rcu_nonlazy_posted_snap, cpu) = + per_cpu(rcu_nonlazy_posted, cpu); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ @@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Callbacks drained"); } +/* + * Keep a running count of the number of non-lazy callbacks posted + * on this CPU. This running counter (which is never decremented) allows + * rcu_prepare_for_idle() to detect when something out of the idle loop + * posts a callback, even if an equal number of callbacks are invoked. + * Of course, callbacks should only be posted from within a trace event + * designed to be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ + __this_cpu_add(rcu_nonlazy_posted, 1); +} + #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); - sprintf(cp, "drain=%d %c timer=%lld", + sprintf(cp, "drain=%d %c timer=%lu", per_cpu(rcu_dyntick_drain, cpu), per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', - hrtimer_active(hrtp) - ? ktime_to_us(hrtimer_get_remaining(hrtp)) - : -1); + timer_pending(tltp) ? tltp->expires - jiffies : -1); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff4..d4bc16ddd1d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->completed, gpnum, rsp->fqs_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh); + rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a..173ea52f3af 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o - - diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afc6d7e7155..03667c3fdb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,6 +83,7 @@ #include "sched.h" #include "../workqueue_sched.h" +#include "../smpboot.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -2083,6 +2084,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); @@ -6382,6 +6384,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sg) return -ENOMEM; + sg->next = sg; + *per_cpu_ptr(sdd->sg, j) = sg; sgp = kzalloc_node(sizeof(struct sched_group_power), @@ -6405,16 +6409,26 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { - struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) - free_sched_groups(sd->groups, 0); - kfree(*per_cpu_ptr(sdd->sd, j)); - kfree(*per_cpu_ptr(sdd->sg, j)); - kfree(*per_cpu_ptr(sdd->sgp, j)); + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); } free_percpu(sdd->sd); + sdd->sd = NULL; free_percpu(sdd->sg); + sdd->sg = NULL; free_percpu(sdd->sgp); + sdd->sgp = NULL; } } @@ -7049,6 +7063,7 @@ void __init sched_init(void) /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); #endif init_sched_fair_class(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f..e9553640c1c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); +static const unsigned int sched_nr_migrate_break = 32; + /* * move_tasks tries to move up to load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". @@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; + env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } @@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env) load = task_h_load(p); - if (load < 16 && !env->sd->nr_balance_failed) + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->load_move) @@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, - .loop_break = sysctl_sched_nr_migrate, + .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); @@ -4445,10 +4447,10 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; + env.load_move = imbalance; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); more_balance: local_irq_save(flags); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d..de00a486c5c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895e..ee376beedaf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -3,16 +3,357 @@ * * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> * - * This defines a simple but solid secure-computing mode. + * Copyright (C) 2012 Google, Inc. + * Will Drewry <wad@chromium.org> + * + * This defines a simple but solid secure-computing facility. + * + * Mode 1 uses a fixed list of allowed system calls. + * Mode 2 allows user-defined system call filters in the form + * of Berkeley Packet Filters/Linux Socket Filters. */ +#include <linux/atomic.h> #include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/sched.h> #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/seccomp.h> /* #define SECCOMP_DEBUG 1 */ -#define NR_SECCOMP_MODES 1 + +#ifdef CONFIG_SECCOMP_FILTER +#include <asm/syscall.h> +#include <linux/filter.h> +#include <linux/ptrace.h> +#include <linux/security.h> +#include <linux/slab.h> +#include <linux/tracehook.h> +#include <linux/uaccess.h> + +/** + * struct seccomp_filter - container for seccomp BPF programs + * + * @usage: reference count to manage the object lifetime. + * get/put helpers should be used when accessing an instance + * outside of a lifetime-guarded section. In general, this + * is only needed for handling filters shared across tasks. + * @prev: points to a previously installed, or inherited, filter + * @len: the number of instructions in the program + * @insns: the BPF program instructions to evaluate + * + * seccomp_filter objects are organized in a tree linked via the @prev + * pointer. For any task, it appears to be a singly-linked list starting + * with current->seccomp.filter, the most recently attached or inherited filter. + * However, multiple filters may share a @prev node, by way of fork(), which + * results in a unidirectional tree existing in memory. This is similar to + * how namespaces work. + * + * seccomp_filter objects should never be modified after being attached + * to a task_struct (other than @usage). + */ +struct seccomp_filter { + atomic_t usage; + struct seccomp_filter *prev; + unsigned short len; /* Instruction count */ + struct sock_filter insns[]; +}; + +/* Limit any path through the tree to 256KB worth of instructions. */ +#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) + +/** + * get_u32 - returns a u32 offset into data + * @data: a unsigned 64 bit value + * @index: 0 or 1 to return the first or second 32-bits + * + * This inline exists to hide the length of unsigned long. If a 32-bit + * unsigned long is passed in, it will be extended and the top 32-bits will be + * 0. If it is a 64-bit unsigned long, then whatever data is resident will be + * properly returned. + * + * Endianness is explicitly ignored and left for BPF program authors to manage + * as per the specific architecture. + */ +static inline u32 get_u32(u64 data, int index) +{ + return ((u32 *)&data)[index]; +} + +/* Helper for bpf_load below. */ +#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) +/** + * bpf_load: checks and returns a pointer to the requested offset + * @off: offset into struct seccomp_data to load from + * + * Returns the requested 32-bits of data. + * seccomp_check_filter() should assure that @off is 32-bit aligned + * and not out of bounds. Failure to do so is a BUG. + */ +u32 seccomp_bpf_load(int off) +{ + struct pt_regs *regs = task_pt_regs(current); + if (off == BPF_DATA(nr)) + return syscall_get_nr(current, regs); + if (off == BPF_DATA(arch)) + return syscall_get_arch(current, regs); + if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { + unsigned long value; + int arg = (off - BPF_DATA(args[0])) / sizeof(u64); + int index = !!(off % sizeof(u64)); + syscall_get_arguments(current, regs, arg, 1, &value); + return get_u32(value, index); + } + if (off == BPF_DATA(instruction_pointer)) + return get_u32(KSTK_EIP(current), 0); + if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) + return get_u32(KSTK_EIP(current), 1); + /* seccomp_check_filter should make this impossible. */ + BUG(); +} + +/** + * seccomp_check_filter - verify seccomp filter code + * @filter: filter to verify + * @flen: length of filter + * + * Takes a previously checked filter (by sk_chk_filter) and + * redirects all filter code that loads struct sk_buff data + * and related data through seccomp_bpf_load. It also + * enforces length and alignment checking of those loads. + * + * Returns 0 if the rule set is legal or -EINVAL if not. + */ +static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) +{ + int pc; + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + u32 k = ftest->k; + + switch (code) { + case BPF_S_LD_W_ABS: + ftest->code = BPF_S_ANC_SECCOMP_LD_W; + /* 32-bit aligned and not out of bounds. */ + if (k >= sizeof(struct seccomp_data) || k & 3) + return -EINVAL; + continue; + case BPF_S_LD_W_LEN: + ftest->code = BPF_S_LD_IMM; + ftest->k = sizeof(struct seccomp_data); + continue; + case BPF_S_LDX_W_LEN: + ftest->code = BPF_S_LDX_IMM; + ftest->k = sizeof(struct seccomp_data); + continue; + /* Explicitly include allowed calls. */ + case BPF_S_RET_K: + case BPF_S_RET_A: + case BPF_S_ALU_ADD_K: + case BPF_S_ALU_ADD_X: + case BPF_S_ALU_SUB_K: + case BPF_S_ALU_SUB_X: + case BPF_S_ALU_MUL_K: + case BPF_S_ALU_MUL_X: + case BPF_S_ALU_DIV_X: + case BPF_S_ALU_AND_K: + case BPF_S_ALU_AND_X: + case BPF_S_ALU_OR_K: + case BPF_S_ALU_OR_X: + case BPF_S_ALU_LSH_K: + case BPF_S_ALU_LSH_X: + case BPF_S_ALU_RSH_K: + case BPF_S_ALU_RSH_X: + case BPF_S_ALU_NEG: + case BPF_S_LD_IMM: + case BPF_S_LDX_IMM: + case BPF_S_MISC_TAX: + case BPF_S_MISC_TXA: + case BPF_S_ALU_DIV_K: + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: + case BPF_S_JMP_JA: + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_K: + case BPF_S_JMP_JSET_X: + continue; + default: + return -EINVAL; + } + } + return 0; +} + +/** + * seccomp_run_filters - evaluates all seccomp filters against @syscall + * @syscall: number of the current system call + * + * Returns valid seccomp BPF response codes. + */ +static u32 seccomp_run_filters(int syscall) +{ + struct seccomp_filter *f; + u32 ret = SECCOMP_RET_ALLOW; + + /* Ensure unexpected behavior doesn't result in failing open. */ + if (WARN_ON(current->seccomp.filter == NULL)) + return SECCOMP_RET_KILL; + + /* + * All filters in the list are evaluated and the lowest BPF return + * value always takes priority (ignoring the DATA). + */ + for (f = current->seccomp.filter; f; f = f->prev) { + u32 cur_ret = sk_run_filter(NULL, f->insns); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + ret = cur_ret; + } + return ret; +} + +/** + * seccomp_attach_filter: Attaches a seccomp filter to current. + * @fprog: BPF program to install + * + * Returns 0 on success or an errno on failure. + */ +static long seccomp_attach_filter(struct sock_fprog *fprog) +{ + struct seccomp_filter *filter; + unsigned long fp_size = fprog->len * sizeof(struct sock_filter); + unsigned long total_insns = fprog->len; + long ret; + + if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) + return -EINVAL; + + for (filter = current->seccomp.filter; filter; filter = filter->prev) + total_insns += filter->len + 4; /* include a 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* + * Installing a seccomp filter requires that the task have + * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. + * This avoids scenarios where unprivileged tasks can affect the + * behavior of privileged children. + */ + if (!current->no_new_privs && + security_capable_noaudit(current_cred(), current_user_ns(), + CAP_SYS_ADMIN) != 0) + return -EACCES; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + return -ENOMEM; + atomic_set(&filter->usage, 1); + filter->len = fprog->len; + + /* Copy the instructions from fprog. */ + ret = -EFAULT; + if (copy_from_user(filter->insns, fprog->filter, fp_size)) + goto fail; + + /* Check and rewrite the fprog via the skb checker */ + ret = sk_chk_filter(filter->insns, filter->len); + if (ret) + goto fail; + + /* Check and rewrite the fprog for seccomp use */ + ret = seccomp_check_filter(filter->insns, filter->len); + if (ret) + goto fail; + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + return 0; +fail: + kfree(filter); + return ret; +} + +/** + * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * @user_filter: pointer to the user data containing a sock_fprog. + * + * Returns 0 on success and non-zero otherwise. + */ +long seccomp_attach_user_filter(char __user *user_filter) +{ + struct sock_fprog fprog; + long ret = -EFAULT; + +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_sock_fprog fprog32; + if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) + goto out; + fprog.len = fprog32.len; + fprog.filter = compat_ptr(fprog32.filter); + } else /* falls through to the if below. */ +#endif + if (copy_from_user(&fprog, user_filter, sizeof(fprog))) + goto out; + ret = seccomp_attach_filter(&fprog); +out: + return ret; +} + +/* get_seccomp_filter - increments the reference count of the filter on @tsk */ +void get_seccomp_filter(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + if (!orig) + return; + /* Reference count is bounded by the number of total processes. */ + atomic_inc(&orig->usage); +} + +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + /* Clean up single-reference branches iteratively. */ + while (orig && atomic_dec_and_test(&orig->usage)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + kfree(freeme); + } +} + +/** + * seccomp_send_sigsys - signals the task to allow in-process syscall emulation + * @syscall: syscall number to send to userland + * @reason: filter-supplied reason code to send to userland (via si_errno) + * + * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. + */ +static void seccomp_send_sigsys(int syscall, int reason) +{ + struct siginfo info; + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; + info.si_code = SYS_SECCOMP; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = reason; + info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_syscall = syscall; + force_sig_info(SIGSYS, &info, current); +} +#endif /* CONFIG_SECCOMP_FILTER */ /* * Secure computing mode 1 allows only read/write/exit/sigreturn. @@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { }; #endif -void __secure_computing(int this_syscall) +int __secure_computing(int this_syscall) { int mode = current->seccomp.mode; - int * syscall; + int exit_sig = 0; + int *syscall; + u32 ret; switch (mode) { - case 1: + case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT if (is_compat_task()) @@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) #endif do { if (*syscall == this_syscall) - return; + return 0; } while (*++syscall); + exit_sig = SIGKILL; + ret = SECCOMP_RET_KILL; + break; +#ifdef CONFIG_SECCOMP_FILTER + case SECCOMP_MODE_FILTER: { + int data; + ret = seccomp_run_filters(this_syscall); + data = ret & SECCOMP_RET_DATA; + ret &= SECCOMP_RET_ACTION; + switch (ret) { + case SECCOMP_RET_ERRNO: + /* Set the low-order 16-bits as a errno. */ + syscall_set_return_value(current, task_pt_regs(current), + -data, 0); + goto skip; + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, task_pt_regs(current)); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + case SECCOMP_RET_TRACE: + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + goto skip; + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + break; + return 0; + case SECCOMP_RET_ALLOW: + return 0; + case SECCOMP_RET_KILL: + default: + break; + } + exit_sig = SIGSYS; break; + } +#endif default: BUG(); } @@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall); - do_exit(SIGKILL); + audit_seccomp(this_syscall, exit_sig, ret); + do_exit(exit_sig); +#ifdef CONFIG_SECCOMP_FILTER +skip: + audit_seccomp(this_syscall, exit_sig, ret); +#endif + return -1; } long prctl_get_seccomp(void) @@ -64,25 +457,48 @@ long prctl_get_seccomp(void) return current->seccomp.mode; } -long prctl_set_seccomp(unsigned long seccomp_mode) +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * This function may be called repeatedly with a @seccomp_mode of + * SECCOMP_MODE_FILTER to install additional filters. Every filter + * successfully installed will be evaluated (in reverse order) for each system + * call the task makes. + * + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) { - long ret; + long ret = -EINVAL; - /* can set it only once to be even more secure */ - ret = -EPERM; - if (unlikely(current->seccomp.mode)) + if (current->seccomp.mode && + current->seccomp.mode != seccomp_mode) goto out; - ret = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); + switch (seccomp_mode) { + case SECCOMP_MODE_STRICT: + ret = 0; #ifdef TIF_NOTSC disable_TSC(); #endif - ret = 0; + break; +#ifdef CONFIG_SECCOMP_FILTER + case SECCOMP_MODE_FILTER: + ret = seccomp_attach_user_filter(filter); + if (ret) + goto out; + break; +#endif + default: + goto out; } - out: + current->seccomp.mode = seccomp_mode; + set_thread_flag(TIF_SECCOMP); +out: return ret; } diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d..1a006b5d9d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -160,7 +160,7 @@ void recalc_sigpending(void) #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ - sigmask(SIGTRAP) | sigmask(SIGFPE)) + sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { @@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; +#ifdef __ARCH_SIGSYS + case __SI_SYS: + err |= __put_user(from->si_call_addr, &to->si_call_addr); + err |= __put_user(from->si_syscall, &to->si_syscall); + err |= __put_user(from->si_arch, &to->si_arch); + break; +#endif default: /* this is just in case for now ... */ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf75..d0ae5b24875 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,8 @@ #include <linux/smp.h> #include <linux/cpu.h> +#include "smpboot.h" + #ifdef CONFIG_USE_GENERIC_SMP_HELPERS static struct { struct list_head queue; @@ -669,6 +671,8 @@ void __init smp_init(void) { unsigned int cpu; + idle_threads_init(); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) @@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), } } EXPORT_SYMBOL(on_each_cpu_cond); + +static void do_nothing(void *unused) +{ +} + +/** + * kick_all_cpus_sync - Force all cpus out of idle + * + * Used to synchronize the update of pm_idle function pointer. It's + * called after the pointer is updated and returns after the dummy + * callback function has been executed on all cpus. The execution of + * the function can only happen on the remote cpus after they have + * left the idle function which had been called via pm_idle function + * pointer. So it's guaranteed that nothing uses the previous pointer + * anymore. + */ +void kick_all_cpus_sync(void) +{ + /* Make sure the change is visible before we kick the cpus */ + smp_mb(); + smp_call_function(do_nothing, NULL, 1); +} +EXPORT_SYMBOL_GPL(kick_all_cpus_sync); diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 00000000000..e1a797e028a --- /dev/null +++ b/kernel/smpboot.c @@ -0,0 +1,62 @@ +/* + * Common SMP CPU bringup/teardown functions + */ +#include <linux/err.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/percpu.h> + +#include "smpboot.h" + +#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD +/* + * For the hotplug case we keep the task structs around and reuse + * them. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_threads); + +struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +{ + struct task_struct *tsk = per_cpu(idle_threads, cpu); + + if (!tsk) + return ERR_PTR(-ENOMEM); + init_idle(tsk, cpu); + return tsk; +} + +void __init idle_thread_set_boot_cpu(void) +{ + per_cpu(idle_threads, smp_processor_id()) = current; +} + +static inline void idle_init(unsigned int cpu) +{ + struct task_struct *tsk = per_cpu(idle_threads, cpu); + + if (!tsk) { + tsk = fork_idle(cpu); + if (IS_ERR(tsk)) + pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); + else + per_cpu(idle_threads, cpu) = tsk; + } +} + +/** + * idle_thread_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ +void __init idle_threads_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + if (cpu != smp_processor_id()) + idle_init(cpu); + } +} +#endif diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 00000000000..80c0acfb847 --- /dev/null +++ b/kernel/smpboot.h @@ -0,0 +1,18 @@ +#ifndef SMPBOOT_H +#define SMPBOOT_H + +struct task_struct; + +int smpboot_prepare(unsigned int cpu); + +#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD +struct task_struct *idle_thread_get(unsigned int cpu); +void idle_thread_set_boot_cpu(void); +void idle_threads_init(void); +#else +static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } +static inline void idle_thread_set_boot_cpu(void) { } +static inline void idle_threads_init(void) { } +#endif + +#endif diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f..2095be3318d 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -34,10 +34,77 @@ #include <linux/delay.h> #include <linux/srcu.h> +/* + * Initialize an rcu_batch structure to empty. + */ +static inline void rcu_batch_init(struct rcu_batch *b) +{ + b->head = NULL; + b->tail = &b->head; +} + +/* + * Enqueue a callback onto the tail of the specified rcu_batch structure. + */ +static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) +{ + *b->tail = head; + b->tail = &head->next; +} + +/* + * Is the specified rcu_batch structure empty? + */ +static inline bool rcu_batch_empty(struct rcu_batch *b) +{ + return b->tail == &b->head; +} + +/* + * Remove the callback at the head of the specified rcu_batch structure + * and return a pointer to it, or return NULL if the structure is empty. + */ +static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) +{ + struct rcu_head *head; + + if (rcu_batch_empty(b)) + return NULL; + + head = b->head; + b->head = head->next; + if (b->tail == &head->next) + rcu_batch_init(b); + + return head; +} + +/* + * Move all callbacks from the rcu_batch structure specified by "from" to + * the structure specified by "to". + */ +static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) +{ + if (!rcu_batch_empty(from)) { + *to->tail = from->head; + to->tail = from->tail; + rcu_batch_init(from); + } +} + +/* single-thread state-machine */ +static void process_srcu(struct work_struct *work); + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; - mutex_init(&sp->mutex); + spin_lock_init(&sp->queue_lock); + sp->running = false; + rcu_batch_init(&sp->batch_queue); + rcu_batch_init(&sp->batch_check0); + rcu_batch_init(&sp->batch_check1); + rcu_batch_init(&sp->batch_done); + INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * srcu_readers_active_idx -- returns approximate number of readers - * active on the specified rank of per-CPU counters. + * Returns approximate total of the readers' ->seq[] values for the + * rank of per-CPU counters specified by idx. */ +static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + unsigned long t; -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); + sum += t; + } + return sum; +} + +/* + * Returns approximate number of readers active on the specified rank + * of the per-CPU ->c[] counters. + */ +static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) { int cpu; - int sum; + unsigned long sum = 0; + unsigned long t; - sum = 0; - for_each_possible_cpu(cpu) - sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + sum += t; + } return sum; } +/* + * Return true if the number of pre-existing readers is determined to + * be stably zero. An example unstable zero can occur if the call + * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, + * but due to task migration, sees the corresponding __srcu_read_unlock() + * decrement. This can happen because srcu_readers_active_idx() takes + * time to sum the array, and might in fact be interrupted or preempted + * partway through the summation. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long seq; + + seq = srcu_readers_seq_idx(sp, idx); + + /* + * The following smp_mb() A pairs with the smp_mb() B located in + * __srcu_read_lock(). This pairing ensures that if an + * __srcu_read_lock() increments its counter after the summation + * in srcu_readers_active_idx(), then the corresponding SRCU read-side + * critical section will see any changes made prior to the start + * of the current SRCU grace period. + * + * Also, if the above call to srcu_readers_seq_idx() saw the + * increment of ->seq[], then the call to srcu_readers_active_idx() + * must see the increment of ->c[]. + */ + smp_mb(); /* A */ + + /* + * Note that srcu_readers_active_idx() can incorrectly return + * zero even though there is a pre-existing reader throughout. + * To see this, suppose that task A is in a very long SRCU + * read-side critical section that started on CPU 0, and that + * no other reader exists, so that the sum of the counters + * is equal to one. Then suppose that task B starts executing + * srcu_readers_active_idx(), summing up to CPU 1, and then that + * task C starts reading on CPU 0, so that its increment is not + * summed, but finishes reading on CPU 2, so that its decrement + * -is- summed. Then when task B completes its sum, it will + * incorrectly get zero, despite the fact that task A has been + * in its SRCU read-side critical section the whole time. + * + * We therefore do a validation step should srcu_readers_active_idx() + * return zero. + */ + if (srcu_readers_active_idx(sp, idx) != 0) + return false; + + /* + * The remainder of this function is the validation step. + * The following smp_mb() D pairs with the smp_mb() C in + * __srcu_read_unlock(). If the __srcu_read_unlock() was seen + * by srcu_readers_active_idx() above, then any destructive + * operation performed after the grace period will happen after + * the corresponding SRCU read-side critical section. + * + * Note that there can be at most NR_CPUS worth of readers using + * the old index, which is not enough to overflow even a 32-bit + * integer. (Yes, this does mean that systems having more than + * a billion or so CPUs need to be 64-bit systems.) Therefore, + * the sum of the ->seq[] counters cannot possibly overflow. + * Therefore, the only way that the return values of the two + * calls to srcu_readers_seq_idx() can be equal is if there were + * no increments of the corresponding rank of ->seq[] counts + * in the interim. But the missed-increment scenario laid out + * above includes an increment of the ->seq[] counter by + * the corresponding __srcu_read_lock(). Therefore, if this + * scenario occurs, the return values from the two calls to + * srcu_readers_seq_idx() will differ, and thus the validation + * step below suffices. + */ + smp_mb(); /* D */ + + return srcu_readers_seq_idx(sp, idx) == seq; +} + /** * srcu_readers_active - returns approximate number of readers. * @sp: which srcu_struct to count active readers (holding srcu_read_lock). @@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) */ static int srcu_readers_active(struct srcu_struct *sp) { - return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); + sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + } + return sum; } /** @@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; preempt_disable(); - idx = sp->completed & 0x1; - barrier(); /* ensure compiler looks -once- at sp->completed. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; - srcu_barrier(); /* ensure compiler won't misorder critical section. */ + idx = rcu_dereference_index_check(sp->completed, + rcu_read_lock_sched_held()) & 0x1; + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; preempt_enable(); return idx; } @@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { preempt_disable(); - srcu_barrier(); /* ensure compiler won't misorder critical section. */ - per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; preempt_enable(); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * we repeatedly block for 1-millisecond time periods. This approach * has done well in testing, so there is no need for a config parameter. */ -#define SYNCHRONIZE_SRCU_READER_DELAY 10 +#define SRCU_RETRY_CHECK_DELAY 5 +#define SYNCHRONIZE_SRCU_TRYCOUNT 2 +#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 /* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + * @@@ Wait until all pre-existing readers complete. Such readers + * will have used the index specified by "idx". + * the caller should ensures the ->completed is not changed while checking + * and idx = (->completed & 1) ^ 1 */ -static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { - int idx; - - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); - - idx = sp->completed; - mutex_lock(&sp->mutex); + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} - /* - * Check to see if someone else did the work for us while we were - * waiting to acquire the lock. We need -two- advances of - * the counter, not just one. If there was but one, we might have - * shown up -after- our helper's first synchronize_sched(), thus - * having failed to prevent CPU-reordering races with concurrent - * srcu_read_unlock()s on other CPUs (see comment below). So we - * either (1) wait for two or (2) supply the second ourselves. - */ +/* + * Increment the ->completed counter so that future SRCU readers will + * use the other rank of the ->c[] and ->seq[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + sp->completed++; +} - if ((sp->completed - idx) >= 2) { - mutex_unlock(&sp->mutex); - return; +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + unsigned long flags; + + head->next = NULL; + head->func = func; + spin_lock_irqsave(&sp->queue_lock, flags); + rcu_batch_queue(&sp->batch_queue, head); + if (!sp->running) { + sp->running = true; + queue_delayed_work(system_nrt_wq, &sp->work, 0); } + spin_unlock_irqrestore(&sp->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(call_srcu); - sync_func(); /* Force memory barrier on all CPUs. */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; - /* - * The preceding synchronize_sched() ensures that any CPU that - * sees the new value of sp->completed will also see any preceding - * changes to data structures made by this CPU. This prevents - * some other CPU from reordering the accesses in its SRCU - * read-side critical section to precede the corresponding - * srcu_read_lock() -- ensuring that such references will in - * fact be protected. - * - * So it is now safe to do the flip. - */ +/* + * Awaken the corresponding synchronize_srcu() instance now that a + * grace period has elapsed. + */ +static void wakeme_after_rcu(struct rcu_head *head) +{ + struct rcu_synchronize *rcu; - idx = sp->completed & 0x1; - sp->completed++; + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); +} - sync_func(); /* Force memory barrier on all CPUs. */ +static void srcu_advance_batches(struct srcu_struct *sp, int trycount); +static void srcu_reschedule(struct srcu_struct *sp); - /* - * At this point, because of the preceding synchronize_sched(), - * all srcu_read_lock() calls using the old counters have completed. - * Their corresponding critical sections might well be still - * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. We initially give readers - * an arbitrarily chosen 10 microseconds to get out of their - * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. The 10-microsecond value has done - * very well in testing. - */ - - if (srcu_readers_active_idx(sp, idx)) - udelay(SYNCHRONIZE_SRCU_READER_DELAY); - while (srcu_readers_active_idx(sp, idx)) - schedule_timeout_interruptible(1); +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, int trycount) +{ + struct rcu_synchronize rcu; + struct rcu_head *head = &rcu.head; + bool done = false; - sync_func(); /* Force memory barrier on all CPUs. */ + rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && + !lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); - /* - * The preceding synchronize_sched() forces all srcu_read_unlock() - * primitives that were executing concurrently with the preceding - * for_each_possible_cpu() loop to have completed by this point. - * More importantly, it also forces the corresponding SRCU read-side - * critical sections to have also completed, and the corresponding - * references to SRCU-protected data items to be dropped. - * - * Note: - * - * Despite what you might think at first glance, the - * preceding synchronize_sched() -must- be within the - * critical section ended by the following mutex_unlock(). - * Otherwise, a task taking the early exit can race - * with a srcu_read_unlock(), which might have executed - * just before the preceding srcu_readers_active() check, - * and whose CPU might have reordered the srcu_read_unlock() - * with the preceding critical section. In this case, there - * is nothing preventing the synchronize_sched() task that is - * taking the early exit from freeing a data structure that - * is still being referenced (out of order) by the task - * doing the srcu_read_unlock(). - * - * Alternatively, the comparison with "2" on the early exit - * could be changed to "3", but this increases synchronize_srcu() - * latency for bulk loads. So the current code is preferred. - */ + init_completion(&rcu.completion); + + head->next = NULL; + head->func = wakeme_after_rcu; + spin_lock_irq(&sp->queue_lock); + if (!sp->running) { + /* steal the processing owner */ + sp->running = true; + rcu_batch_queue(&sp->batch_check0, head); + spin_unlock_irq(&sp->queue_lock); + + srcu_advance_batches(sp, trycount); + if (!rcu_batch_empty(&sp->batch_done)) { + BUG_ON(sp->batch_done.head != head); + rcu_batch_dequeue(&sp->batch_done); + done = true; + } + /* give the processing owner to work_struct */ + srcu_reschedule(sp); + } else { + rcu_batch_queue(&sp->batch_queue, head); + spin_unlock_irq(&sp->queue_lock); + } - mutex_unlock(&sp->mutex); + if (!done) + wait_for_completion(&rcu.completion); } /** @@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, synchronize_sched); + __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * synchronize_srcu_expedited - Brute-force SRCU grace period * @sp: srcu_struct with which to synchronize. * - * Wait for an SRCU grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_srcu_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_srcu() instead. + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. * * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. It is also illegal to call + * that is acquired by a CPU-hotplug notifier. It is also illegal to call * synchronize_srcu_expedited() from the corresponding SRCU read-side * critical section; doing so will result in deadlock. However, it is * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct @@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); */ void synchronize_srcu_expedited(struct srcu_struct *sp) { - __synchronize_srcu(sp, synchronize_sched_expedited); + __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** * srcu_batches_completed - return batches completed. * @sp: srcu_struct on which to report batch completion. * * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ - long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } EXPORT_SYMBOL_GPL(srcu_batches_completed); + +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + +/* + * Move any new SRCU callbacks to the first stage of the SRCU grace + * period pipeline. + */ +static void srcu_collect_new(struct srcu_struct *sp) +{ + if (!rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + rcu_batch_move(&sp->batch_check0, &sp->batch_queue); + spin_unlock_irq(&sp->queue_lock); + } +} + +/* + * Core SRCU state machine. Advance callbacks from ->batch_check0 to + * ->batch_check1 and then to ->batch_done as readers drain. + */ +static void srcu_advance_batches(struct srcu_struct *sp, int trycount) +{ + int idx = 1 ^ (sp->completed & 1); + + /* + * Because readers might be delayed for an extended period after + * fetching ->completed for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + */ + + if (rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_check1)) + return; /* no callbacks need to be advanced */ + + if (!try_check_zero(sp, idx, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have already done with their + * first zero check and flip back when they were enqueued on + * ->batch_check0 in a previous invocation of srcu_advance_batches(). + * (Presumably try_check_zero() returned false during that + * invocation, leaving the callbacks stranded on ->batch_check1.) + * They are therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + if (rcu_batch_empty(&sp->batch_check0)) + return; /* no callbacks need to be advanced */ + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + /* + * SRCU read-side critical sections are normally short, so check + * at least twice in quick succession after a flip. + */ + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx^1, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for all + * pre-existing readers using both idx values. They are therefore + * ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. + */ +static void srcu_invoke_callbacks(struct srcu_struct *sp) +{ + int i; + struct rcu_head *head; + + for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { + head = rcu_batch_dequeue(&sp->batch_done); + if (!head) + break; + local_bh_disable(); + head->func(head); + local_bh_enable(); + } +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp) +{ + bool pending = true; + + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + sp->running = false; + pending = false; + } + spin_unlock_irq(&sp->queue_lock); + } + + if (pending) + queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +static void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_collect_new(sp); + srcu_advance_batches(sp, 1); + srcu_invoke_callbacks(sp); + srcu_reschedule(sp); +} diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e..ba0ae8eea6f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); + error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); @@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = put_user(me->signal->is_child_subreaper, (int __user *) arg2); break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 52b3a06a02f..4ab11879aeb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write, #endif #ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -703,7 +703,7 @@ static struct ctl_table kern_table[] = { .data = &dmesg_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = &zero, .extra2 = &one, }, @@ -712,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dmesg_restrict, + .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = &zero, .extra2 = &two, }, @@ -1943,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write, } #ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2cf9cc7aa10..a20dc8a3c94 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,6 +1,10 @@ # # Timer subsystem related configuration options # + +# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. config TICK_ONESHOT bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7..aa27d391bfc 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); * If one has not already been chosen, it checks to see if a * functional rtc device is available. */ -static struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; struct rtc_device *ret; @@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -static inline struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void) { return NULL; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e883f57a3cd..f113755695e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -346,7 +346,8 @@ int tick_resume_broadcast(void) tick_get_broadcast_mask()); break; case TICKDEV_MODE_ONESHOT: - broadcast = tick_resume_broadcast_oneshot(bc); + if (!cpumask_empty(tick_get_broadcast_mask())) + broadcast = tick_resume_broadcast_oneshot(bc); break; } } @@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + return clockevents_program_event(bc, expires, force); } @@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); /* Take the do_timer update */ tick_do_timer_cpu = cpu; @@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) to_cpumask(tmpmask)); if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(to_cpumask(tmpmask), tick_next_period); tick_broadcast_set_event(tick_next_period, 1); @@ -575,15 +579,12 @@ void tick_broadcast_switch_to_oneshot(void) unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - if (cpumask_empty(tick_get_broadcast_mask())) - goto end; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); -end: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3526038f283..6a3a5b9ff56 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_get_expires(&ts->sched_timer), 0)) break; } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); + /* Reread time and update jiffies */ now = ktime_get(); + tick_do_update_jiffies64(now); } } diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888..09de9a941cd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); * * mod_timer_pinned() is a way to update the expire field of an * active timer (if the timer is inactive it will be activated) - * and not allow the timer to be migrated to a different CPU. + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline. If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline. * * mod_timer_pinned(timer, expires) is equivalent to: * @@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = timer->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5e..b3afe0e76f7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cdea7b56b0c..c0bd0308741 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_trace_remove); -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, - .open = blk_dropped_open, + .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { @@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, - .open = blk_msg_open, + .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed7b5d1e12f..2a22255c101 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4629,7 +4629,8 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; char buf[64]; int r; @@ -4647,7 +4648,8 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ring_buffer *buffer = filp->private_data; + struct trace_array *tr = filp->private_data; + struct ring_buffer *buffer = tr->buffer; unsigned long val; int ret; @@ -4734,7 +4736,7 @@ static __init int tracer_init_debugfs(void) &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, - global_trace.buffer, &rb_simple_fops); + &global_trace, &rb_simple_fops); #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95059f091a2..f95d65da6db 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -836,11 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_FUNCTION_TRACER +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL -#endif /* CONFIG_FUNCTION_TRACER */ +#endif #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9..29111da1d10 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (!call->name || !call->class || !call->class->reg) continue; + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + if (match && strcmp(match, call->name) != 0 && strcmp(match, call->class->system) != 0) @@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->reg) + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) trace_create_file("enable", 0644, call->dir, call, enable); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc85..e039906b037 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 859fae6b182..df611a0e76c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; int ret; + /* trace_find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, @@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter) unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); unsigned long rel_usecs; + /* Restore the original ent_size */ + iter->ent_size = ent_size; + if (!next_entry) next_ts = iter->ts; rel_usecs = ns2usecs(next_ts - iter->ts); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a472..00000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Workqueue statistical tracer. - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - - -#include <trace/events/workqueue.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/kref.h> -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { - struct list_head list; - struct kref kref; - int cpu; - pid_t pid; -/* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; -/* - * Don't need to be atomic, works are serialized in a single workqueue thread - * on a single CPU. - */ - unsigned int executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { - struct list_head list; - spinlock_t lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. - */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ - kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - atomic_inc(&node->inserted); - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - node->executed++; - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, - struct task_struct *wq_thread, int cpu) -{ - struct cpu_workqueue_stats *cws; - unsigned long flags; - - WARN_ON(cpu < 0); - - /* Workqueues are sometimes created in atomic context */ - cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); - if (!cws) { - pr_warning("trace_workqueue: not enough memory\n"); - return; - } - INIT_LIST_HEAD(&cws->list); - kref_init(&cws->kref); - cws->cpu = cpu; - cws->pid = wq_thread->pid; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ - /* Workqueue only execute on one cpu */ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { - if (node->pid == wq_thread->pid) { - list_del(&node->list); - kref_put(&node->kref, cpu_workqueue_stat_free); - goto found; - } - } - - pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ - unsigned long flags; - struct cpu_workqueue_stats *ret = NULL; - - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { - ret = list_entry(workqueue_cpu_stat(cpu)->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ - int cpu; - void *ret = NULL; - - for_each_possible_cpu(cpu) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ - struct cpu_workqueue_stats *prev_cws = prev; - struct cpu_workqueue_stats *ret; - int cpu = prev_cws->cpu; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - do { - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - return NULL; - } while (!(ret = workqueue_stat_start_cpu(cpu))); - return ret; - } else { - ret = list_entry(prev_cws->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ - struct cpu_workqueue_stats *cws = p; - struct pid *pid; - struct task_struct *tsk; - - pid = find_get_pid(cws->pid); - if (pid) { - tsk = get_pid_task(pid, PIDTYPE_PID); - if (tsk) { - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), cws->executed, - tsk->comm); - put_task_struct(tsk); - } - put_pid(pid); - } - - return 0; -} - -static void workqueue_stat_release(void *stat) -{ - struct cpu_workqueue_stats *node = stat; - - kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ - seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n"); - return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { - .name = "workqueues", - .stat_start = workqueue_stat_start, - .stat_next = workqueue_stat_next, - .stat_show = workqueue_stat_show, - .stat_release = workqueue_stat_release, - .stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ - if (register_stat_tracer(&workqueue_stats)) { - pr_warning("Unable to register workqueue stat tracer\n"); - return 1; - } - - return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. - */ -int __init trace_workqueue_early_init(void) -{ - int ret, cpu; - - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - - ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); - if (ret) - goto out; - - ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); - if (ret) - goto no_insertion; - - ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); - if (ret) - goto no_execution; - - ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); - if (ret) - goto no_creation; - - return 0; - -no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: - pr_warning("trace_workqueue: unable to trace workqueues\n"); - - return 1; -} -early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c0..9a3128dc67d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, cwq = get_cwq(gcwq->cpu, wq); trace_workqueue_queue_work(cpu, cwq, work); - BUG_ON(!list_empty(&work->entry)); + if (WARN_ON(!list_empty(&work->entry))) { + spin_unlock_irqrestore(&gcwq->lock, flags); + return; + } cwq->nr_in_flight[cwq->work_color]++; work_flags = work_color_to_flags(cwq->work_color); @@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) } else wake_up_all(&gcwq->trustee_wait); - /* sanity check nr_running */ - WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + /* + * Sanity check nr_running. Because trustee releases gcwq->lock + * between setting %WORKER_ROGUE and zapping nr_running, the + * warning may trigger spuriously. Check iff trustee is idle. + */ + WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + gcwq->nr_workers == gcwq->nr_idle && atomic_read(get_gcwq_nr_running(gcwq->cpu))); } @@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = work->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* * A single work shouldn't be executed concurrently by @@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + if (start_flush_work(work, &barr, true)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); |