diff options
Diffstat (limited to 'kernel')
32 files changed, 583 insertions, 243 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd..91e53d04b6a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3..43c307dc945 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f8..a7c9e6ddb97 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); - if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && - root->flags != opts.flags) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto drop_new_super; + if (root->flags != opts.flags) { + if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } else { + pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + } } /* no subsys rebinding, so refcounts don't change */ @@ -2699,13 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, goto out; } + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } @@ -2953,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2965,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b87..383f8231e43 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include <linux/context_tracking.h> -#include <linux/kvm_host.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/hardirq.h> @@ -71,6 +70,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427..198a38883e6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void) core_initcall(alloc_frozen_cpus); /* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - -/* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038..e695c0a0bcb 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include <linux/cpu.h> #include <linux/tick.h> #include <linux/mm.h> +#include <linux/stackprotector.h> #include <asm/tlb.h> @@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* @@ -112,6 +114,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c..b391907d535 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -2918,6 +2915,7 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event) if (has_branch_stack(event)) { static_key_slow_dec_deferred(&perf_sched_events); /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) + if (!(event->attach_state & PERF_ATTACH_TASK)) { atomic_dec(&per_cpu(perf_branch_stack_events, event->cpu)); + } } } if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; + struct ring_buffer *rb; + + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) @@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + } rcu_read_unlock(); } @@ -3584,18 +3573,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3605,26 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); + + atomic_dec(&rb->mmap_count); + + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } + + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. + */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; + } + rcu_read_unlock(); + + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } mutex_unlock(&event->mmap_mutex); + put_event(event); - ring_buffer_put(rb); - free_uid(user); + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3674,12 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3720,12 +3787,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + atomic_set(&rb->mmap_count, 1); + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + ring_buffer_attach(event, rb); + rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3734,7 +3805,11 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -6412,6 +6487,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6419,16 +6496,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1..20185ea64aa 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type && - cpu == iter->cpu) + (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } @@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, return; } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (cpu >= 0) { toggle_bp_task_slot(bp, cpu, enable, type, weight); } else { - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) toggle_bp_task_slot(bp, cpu, enable, type, weight); } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59d..ca6599723be 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,10 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + atomic_t mmap_count; + unsigned long mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd49..7bb73f9d09d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); - exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); if (group_dead) @@ -795,6 +794,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); exit_thread(); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0..54a4d522323 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping - * @first_irq: first number of irq block assigned to the domain + * @first_irq: first number of irq block assigned to the domain, + * pass zero to assign irqs on-the-fly. This will result in a + * linear IRQ domain so it is important to use irq_create_mapping() + * for each used IRQ, especially when SPARSE_IRQ is enabled. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -191,6 +194,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, /* A linear domain is the default */ return irq_domain_add_linear(of_node, size, ops, host_data); } +EXPORT_SYMBOL_GPL(irq_domain_add_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. @@ -397,11 +401,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, while (count--) { int irq = irq_base + count; struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irq_data->hwirq; + irq_hw_number_t hwirq; if (WARN_ON(!irq_data || irq_data->domain != domain)) continue; + hwirq = irq_data->hwirq; irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcd..bddf3b201a4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) /* Ditto to do_optimize_kprobes */ get_online_cpus(); mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static __kprobes void kprobe_optimizer(struct work_struct *work) { - LIST_HEAD(free_list); - mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. */ - do_unoptimize_kprobes(&free_list); + do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); + do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + + if (kprobe_unused(p)) { + /* Enqueue if it is unused */ + list_add(&op->list, &freeing_list); + /* + * Remove unused probes from the hash list. After waiting + * for synchronization, this probe is reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes().) + */ + hlist_del_rcu(&op->kp.hlist); + } + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180..9c39de095ba 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP - select HOTPLUG select HOTPLUG_CPU config PM_AUTOSLEEP diff --git a/kernel/printk.c b/kernel/printk.c index fa36e149442..8212c1aef12 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69..335a7ae697f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(is_compat_task())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - ret = copy_siginfo_to_user32(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); + if (copy_siginfo_to_user32(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } + } else #endif { siginfo_t __user *uinfo = (siginfo_t __user *) data; - ret = copy_siginfo_to_user(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); - } - - if (ret) { - ret = -EFAULT; - break; + if (copy_siginfo_to_user(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } } data += sizeof(siginfo_t); diff --git a/kernel/range.c b/kernel/range.c index 071b0ab455c..322ea8e93e4 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sort.h> - +#include <linux/string.h> #include <linux/range.h> int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (start >= end) return nr_range; - /* Try to merge it with old one: */ + /* get new start/end: */ for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; u64 common_start, common_end; if (!range[i].end) @@ -45,12 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (common_start > common_end) continue; - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); + /* new start/end, will add it back at last */ + start = min(range[i].start, start); + end = max(range[i].end, end); - range[i].start = final_start; - range[i].end = final_end; - return nr_range; + memmove(&range[i], &range[i + 1], + (nr_range - (i + 1)) * sizeof(range[i])); + range[nr_range - 1].start = 0; + range[nr_range - 1].end = 0; + nr_range--; + i--; } /* Need to add it: */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 16ea6792501..35380019f0f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && system_state == SYSTEM_RUNNING) - schedule_timeout_uninterruptible(2); + udelay(200); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } @@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg) } } +static void rsp_wakeup(struct irq_work *work) +{ + struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, } rsp->gp_flags = RCU_GP_FLAG_INIT; - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + /* + * We can't do wakeups while holding the rnp->lock, as that + * could cause possible deadlocks with the rq->lock. Deter + * the wakeup to interrupt context. + */ + irq_work_queue(&rsp->wakeup_work); } /* @@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); + init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index da77a8f57ff..4df503470e4 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/irq_work.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -442,6 +443,7 @@ struct rcu_state { char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272f..e8b335016c5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } #else /* CONFIG_NO_HZ_COMMON */ @@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() - && !tick_nohz_full_cpu(smp_processor_id())) + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1417,7 +1430,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } @@ -4745,7 +4758,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a..b5ccba22603 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0da..3d6833f125d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing for at most 2 ms, - * and if need_resched() is not set. + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. + * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in + * certain cases, such as stop_machine(), jiffies may cease to + * increment and so we need the MAX_SOFTIRQ_RESTART limit as + * well to make sure we eventually return from this method. * * These limits have been established via experimentation. * The two things to balance is latency against fairness - @@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. */ #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) +#define MAX_SOFTIRQ_RESTART 10 asmlinkage void __do_softirq(void) { @@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; + int max_restart = MAX_SOFTIRQ_RESTART; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -265,7 +271,8 @@ restart: pending = local_softirq_pending(); if (pending) { - if (time_before(jiffies, end) && !need_resched()) + if (time_before(jiffies, end) && !need_resched() && + --max_restart) goto restart; wakeup_softirqd(); diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba2..2bbd9a73b54 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 12ff13a838c..8f5b3b98577 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -874,7 +874,6 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; - unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d57766..20d6fba7065 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -511,6 +511,12 @@ again: } } + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); @@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); /* * We only reprogram the broadcast timer if we @@ -593,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 == KTIME_MAX) - goto out; /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -609,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* * If the pending bit is not set, then we are * either the CPU handling the broadcast * interrupt or we got woken by something else. @@ -692,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; - /* Take the do_timer update */ - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - /* * We must be careful here. There might be other CPUs * waiting for periodic broadcast. We need to set the diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf..0cf1c145318 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * we can't safely shutdown that CPU. */ if (have_nohz_full_mask && tick_do_timer_cpu == cpu) - return -EINVAL; + return NOTIFY_BAD; break; } return NOTIFY_OK; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe4..baeeb5c87cf 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b549b0f5b97..6c508ff33c6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * can use rcu_dereference_raw_notrace() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw(list); \ + op = rcu_dereference_raw_notrace(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). */ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw((op)->next)) && \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ unlikely((op) != &ftrace_list_end)) static inline void ftrace_ops_init(struct ftrace_ops *ops) @@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, hhd, node) { + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, hhd, hlist) { + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, hhd, node) { + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b59aea2c48c..e444ff88f0a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ae6fa2d1cdf..e71a8be4a6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,8 +652,6 @@ static struct { ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -843,7 +841,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; @@ -2818,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* stop the trace while dumping if we are not opening "snapshot" */ @@ -3817,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->cpu_file = tc->cpu; @@ -5087,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - if (trace_clocks[trace_clock_id].in_ns) { + if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); @@ -6216,10 +6222,15 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); - register_tracer(&nop_trace); - + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. + */ global_trace.current_trace = &nop_trace; + register_tracer(&nop_trace); + /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f..20572ed88c5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 55e2cf66967..2901e3b8859 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ea..ce0daa320a2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; } EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @word: The word being waited on, a kernel virtual address + * @bit: The bit of the word being waited on + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); |