summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c23
-rw-r--r--kernel/cpu.c9
-rw-r--r--kernel/cpuset.c20
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/mutex.c4
-rw-r--r--kernel/nsproxy.c27
-rw-r--r--kernel/pid_namespace.c4
-rw-r--r--kernel/power/hibernate.c45
-rw-r--r--kernel/power/qos.c20
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/user.c24
-rw-r--r--kernel/printk/braille.c3
-rw-r--r--kernel/printk/printk.c7
-rw-r--r--kernel/ptrace.c1
-rw-r--r--kernel/sched/core.c96
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/fair.c10
-rw-r--r--kernel/time/sched_clock.c2
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/time/timer_list.c41
-rw-r--r--kernel/trace/ftrace.c87
-rw-r--r--kernel/trace/trace.c27
-rw-r--r--kernel/trace/trace_events.c200
-rw-r--r--kernel/trace/trace_events_filter.c17
-rw-r--r--kernel/trace/trace_kprobe.c21
-rw-r--r--kernel/trace/trace_uprobe.c51
-rw-r--r--kernel/user_namespace.c17
-rw-r--r--kernel/wait.c3
-rw-r--r--kernel/workqueue.c80
29 files changed, 580 insertions, 283 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 789ec4683db..e91963302c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4335,8 +4335,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
}
err = percpu_ref_init(&css->refcnt, css_release);
- if (err)
+ if (err) {
+ ss->css_free(cgrp);
goto err_free_all;
+ }
init_cgroup_css(css, ss, cgrp);
@@ -4478,6 +4480,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
struct dentry *d = cgrp->dentry;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
+ struct cgroup *child;
bool empty;
lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4488,12 +4491,28 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
* @cgrp from being removed while __put_css_set() is in progress.
*/
read_lock(&css_set_lock);
- empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
+ empty = list_empty(&cgrp->cset_links);
read_unlock(&css_set_lock);
if (!empty)
return -EBUSY;
/*
+ * Make sure there's no live children. We can't test ->children
+ * emptiness as dead children linger on it while being destroyed;
+ * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
+ */
+ empty = true;
+ rcu_read_lock();
+ list_for_each_entry_rcu(child, &cgrp->children, sibling) {
+ empty = cgroup_is_dead(child);
+ if (!empty)
+ break;
+ }
+ rcu_read_unlock();
+ if (!empty)
+ return -EBUSY;
+
+ /*
* Block new css_tryget() by killing css refcnts. cgroup core
* guarantees that, by the time ->css_offline() is invoked, no new
* css reference will be given out via css_tryget(). We can't
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b2b227b8212..d7f07a2da5a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* get_online_cpus() not an api which is called all that often.
*
*/
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
{
cpu_hotplug.active_writer = current;
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
}
}
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
cpu_maps_update_done();
}
-#else /* #if CONFIG_HOTPLUG_CPU */
-static void cpu_hotplug_begin(void) {}
-static void cpu_hotplug_done(void) {}
-#endif /* #else #if CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fed..ea1966db34f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -475,13 +475,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
/*
* Cpusets with tasks - existing or newly being attached - can't
- * have empty cpus_allowed or mems_allowed.
+ * be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
- (cpumask_empty(trial->cpus_allowed) &&
- nodes_empty(trial->mems_allowed)))
- goto out;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
ret = 0;
out:
@@ -1608,11 +1612,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
+ int retval = 0;
mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb8a96..bf46287c91a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* don't allow the creation of threads.
*/
if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
- (task_active_pid_ns(current) != current->nsproxy->pid_ns))
+ (task_active_pid_ns(current) !=
+ current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (pid != &init_struct_pid) {
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (!pid)
goto bad_fork_cleanup_io;
}
@@ -1679,6 +1680,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff05f4bd86e..a52ee7bb830 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -686,7 +686,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
might_sleep();
ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
0, &ctx->dep_map, _RET_IP_, ctx);
- if (!ret && ctx->acquired > 0)
+ if (!ret && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
@@ -702,7 +702,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
0, &ctx->dep_map, _RET_IP_, ctx);
- if (!ret && ctx->acquired > 0)
+ if (!ret && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0..997cbb951a3 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
- .uts_ns = &init_uts_ns,
+ .count = ATOMIC_INIT(1),
+ .uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
- .ipc_ns = &init_ipc_ns,
+ .ipc_ns = &init_ipc_ns,
#endif
- .mnt_ns = NULL,
- .pid_ns = &init_pid_ns,
+ .mnt_ns = NULL,
+ .pid_ns_for_children = &init_pid_ns,
#ifdef CONFIG_NET
- .net_ns = &init_net,
+ .net_ns = &init_net,
#endif
};
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ipc;
}
- new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
- if (IS_ERR(new_nsp->pid_ns)) {
- err = PTR_ERR(new_nsp->pid_ns);
+ new_nsp->pid_ns_for_children =
+ copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
+ if (IS_ERR(new_nsp->pid_ns_for_children)) {
+ err = PTR_ERR(new_nsp->pid_ns_for_children);
goto out_pid;
}
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
out_net:
- if (new_nsp->pid_ns)
- put_pid_ns(new_nsp->pid_ns);
+ if (new_nsp->pid_ns_for_children)
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
if (new_nsp->ipc_ns)
put_ipc_ns(new_nsp->ipc_ns);
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns)
put_uts_ns(ns->uts_ns);
if (ns->ipc_ns)
put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns)
- put_pid_ns(ns->pid_ns);
+ if (ns->pid_ns_for_children)
+ put_pid_ns(ns->pid_ns_for_children);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48..601bb361c23 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
if (ancestor != active)
return -EINVAL;
- put_pid_ns(nsproxy->pid_ns);
- nsproxy->pid_ns = get_pid_ns(new);
+ put_pid_ns(nsproxy->pid_ns_for_children);
+ nsproxy->pid_ns_for_children = get_pid_ns(new);
return 0;
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773..0b78f72ad39 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -644,22 +644,23 @@ int hibernate(void)
if (error)
goto Exit;
- /* Allocate memory management structures */
- error = create_basic_memory_bitmaps();
- if (error)
- goto Exit;
-
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
error = freeze_processes();
if (error)
- goto Free_bitmaps;
+ goto Exit;
+
+ lock_device_hotplug();
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Thaw;
error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
if (error || freezer_test_done)
- goto Thaw;
+ goto Free_bitmaps;
if (in_suspend) {
unsigned int flags = 0;
@@ -682,14 +683,14 @@ int hibernate(void)
pr_debug("PM: Image restored successfully.\n");
}
+ Free_bitmaps:
+ free_basic_memory_bitmaps();
Thaw:
+ unlock_device_hotplug();
thaw_processes();
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
-
- Free_bitmaps:
- free_basic_memory_bitmaps();
Exit:
pm_notifier_call_chain(PM_POST_HIBERNATION);
pm_restore_console();
@@ -806,21 +807,20 @@ static int software_resume(void)
pm_prepare_console();
error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
if (error)
- goto close_finish;
-
- error = create_basic_memory_bitmaps();
- if (error)
- goto close_finish;
+ goto Close_Finish;
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
- if (error) {
- swsusp_close(FMODE_READ);
- goto Done;
- }
+ if (error)
+ goto Close_Finish;
pr_debug("PM: Loading hibernation image.\n");
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Thaw;
+
error = swsusp_read(&flags);
swsusp_close(FMODE_READ);
if (!error)
@@ -828,9 +828,10 @@ static int software_resume(void)
printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
swsusp_free();
- thaw_processes();
- Done:
free_basic_memory_bitmaps();
+ Thaw:
+ unlock_device_hotplug();
+ thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
pm_restore_console();
@@ -840,7 +841,7 @@ static int software_resume(void)
mutex_unlock(&pm_mutex);
pr_debug("PM: Hibernation image not present or could not be loaded.\n");
return error;
-close_finish:
+ Close_Finish:
swsusp_close(FMODE_READ);
goto Finish;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 06fe28589e9..a394297f8b2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req)
}
EXPORT_SYMBOL_GPL(pm_qos_request_active);
+static void __pm_qos_update_request(struct pm_qos_request *req,
+ s32 new_value)
+{
+ trace_pm_qos_update_request(req->pm_qos_class, new_value);
+
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
/**
* pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
* @work: work struct for the delayed work (timeout)
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work)
struct pm_qos_request,
work);
- pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+ __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
}
/**
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
}
cancel_delayed_work_sync(&req->work);
-
- trace_pm_qos_update_request(req->pm_qos_class, new_value);
- if (new_value != req->node.prio)
- pm_qos_update_target(
- pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ __pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1..62ee437b5c7 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
+ ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
+ ftrace_start();
Platform_wake:
if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
- ftrace_stop();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
- ftrace_start();
resume_console();
Close:
if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86..72e8f4fd616 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -60,11 +60,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
error = -ENOSYS;
goto Unlock;
}
- if(create_basic_memory_bitmaps()) {
- atomic_inc(&snapshot_device_available);
- error = -ENOMEM;
- goto Unlock;
- }
nonseekable_open(inode, filp);
data = &snapshot_state;
filp->private_data = data;
@@ -90,10 +85,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
- if (error) {
- free_basic_memory_bitmaps();
+ if (error)
atomic_inc(&snapshot_device_available);
- }
+
data->frozen = 0;
data->ready = 0;
data->platform_support = 0;
@@ -111,11 +105,11 @@ static int snapshot_release(struct inode *inode, struct file *filp)
lock_system_sleep();
swsusp_free();
- free_basic_memory_bitmaps();
data = filp->private_data;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
thaw_processes();
}
pm_notifier_call_chain(data->mode == O_RDONLY ?
@@ -207,6 +201,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ lock_device_hotplug();
data = filp->private_data;
switch (cmd) {
@@ -220,14 +215,22 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
printk("done.\n");
error = freeze_processes();
- if (!error)
+ if (error)
+ break;
+
+ error = create_basic_memory_bitmaps();
+ if (error)
+ thaw_processes();
+ else
data->frozen = 1;
+
break;
case SNAPSHOT_UNFREEZE:
if (!data->frozen || data->ready)
break;
pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
thaw_processes();
data->frozen = 0;
break;
@@ -371,6 +374,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
+ unlock_device_hotplug();
mutex_unlock(&pm_mutex);
return error;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index b51087fb9ac..276762f3a46 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -19,7 +19,8 @@ char *_braille_console_setup(char **str, char **brl_options)
pr_err("need port name after brl=\n");
else
*((*str)++) = 0;
- }
+ } else
+ return NULL;
return *str;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5a7080e2a..b4e8500afdb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon)
struct console *bcon = NULL;
struct console_cmdline *c;
+ if (console_drivers)
+ for_each_console(bcon)
+ if (WARN(bcon == newcon,
+ "console '%s%d' already registered\n",
+ bcon->name, bcon->index))
+ return;
+
/*
* before we register a new CON_BOOT console, make sure we don't
* already have a valid console
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4041f5747e7..a146ee327f6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -469,7 +469,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
- flush_ptrace_hw_breakpoint(child);
write_lock_irq(&tasklist_lock);
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb7bfe..05c39f03031 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -933,6 +933,8 @@ static int effective_prio(struct task_struct *p)
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
@@ -1482,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
static int
@@ -1491,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -1577,8 +1585,9 @@ out:
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
- * processes. Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
@@ -2191,6 +2200,8 @@ void scheduler_tick(void)
* This makes sure that uptime, CFS vruntime, load
* balancing, etc... continue to move forward, even
* with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
*/
u64 scheduler_tick_max_deferment(void)
{
@@ -2394,6 +2405,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -2796,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -2829,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible. The caller is accounted as waiting for IO.
*
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -2846,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
* This waits for completion of a specific task to be signaled. It is
* interruptible.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
@@ -2865,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
@@ -2883,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
@@ -2903,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
@@ -2918,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
* try_wait_for_completion - try to decrement a completion without blocking
* @x: completion structure
*
- * Returns: 0 if a decrement cannot be done without blocking
+ * Return: 0 if a decrement cannot be done without blocking
* 1 if a decrement succeeded.
*
* If a completion is being used as a counting completion,
@@ -2945,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
* completion_done - Test to see if a completion has any waiters
* @x: completion structure
*
- * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
*/
@@ -3182,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
@@ -3194,6 +3211,8 @@ int task_prio(const struct task_struct *p)
/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
*/
int task_nice(const struct task_struct *p)
{
@@ -3204,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);
/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int idle_cpu(int cpu)
{
@@ -3226,6 +3247,8 @@ int idle_cpu(int cpu)
/**
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -3235,6 +3258,8 @@ struct task_struct *idle_task(int cpu)
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
*/
static struct task_struct *find_process_by_pid(pid_t pid)
{
@@ -3432,6 +3457,8 @@ recheck:
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Return: 0 on success. An error code otherwise.
+ *
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
@@ -3451,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* current context has permission. For example, this is needed in
* stop_machine(): we create temporary high priority worker threads,
* but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
@@ -3485,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
* @pid: the pid in question.
* @policy: new policy.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
struct sched_param __user *, param)
@@ -3500,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
* sys_sched_setparam - set/change the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -3509,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
*/
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
@@ -3535,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
* sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
@@ -3659,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -3710,6 +3751,8 @@ out_unlock:
* @pid: pid of the process
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -3744,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*
* This function yields the current CPU to other tasks. If there are no
* other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
*/
SYSCALL_DEFINE0(sched_yield)
{
@@ -3869,7 +3914,7 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns:
+ * Return:
* true (>0) if we indeed boosted the target task.
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
@@ -3972,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)
* sys_sched_get_priority_max - return maximum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
@@ -3997,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
* sys_sched_get_priority_min - return minimum RT priority.
* @policy: scheduling class.
*
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
*/
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
@@ -4024,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
*
* this syscall writes the default timeslice value of a given process
* into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
*/
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
@@ -6632,6 +6682,8 @@ void normalize_rt_tasks(void)
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
*/
struct task_struct *curr_task(int cpu)
{
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46..8b836b376d9 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
- * Returns: (int)bool - CPUs were found
+ * Return: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
*
- * Returns: -ENOMEM if memory fails.
+ * Return: -ENOMEM on memory allocation failure.
*/
int cpupri_init(struct cpupri *cp)
{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9565645e320..68f1609ca14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4280,6 +4281,8 @@ struct sg_lb_stats {
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ *
+ * Return: The load index.
*/
static inline int get_sd_load_idx(struct sched_domain *sd,
enum cpu_idle_type idle)
@@ -4574,6 +4577,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
@@ -4691,7 +4697,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
* @env: The load balancing environment.
@@ -4869,7 +4875,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
- * Returns: - the busiest group if imbalance exists.
+ * Return: - The busiest group if imbalance exists.
* - If no imbalance and user has opted for power-savings balance,
* return the least loaded group whose CPUs can be
* put to idle by rebalancing its tasks onto our group.
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f0..0b479a6a22b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
BUG_ON(bits > 32);
WARN_ON(!irqs_disabled());
read_sched_clock = read;
- sched_clock_mask = (1 << bits) - 1;
+ sched_clock_mask = (1ULL << bits) - 1;
cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc97e03..e8a1516cc0a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,7 +182,8 @@ static bool can_stop_full_tick(void)
* Don't allow the user to think they can get
* full NO_HZ with this machine.
*/
- WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
+ WARN_ONCE(have_nohz_full_mask,
+ "NO_HZ FULL will not work with unstable sched clock");
return false;
}
#endif
@@ -343,8 +344,6 @@ static int tick_nohz_init_all(void)
void __init tick_nohz_init(void)
{
- int cpu;
-
if (!have_nohz_full_mask) {
if (tick_nohz_init_all() < 0)
return;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf2832301..61ed862cdd3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
static int timer_list_show(struct seq_file *m, void *v)
{
struct timer_list_iter *iter = v;
- u64 now = ktime_to_ns(ktime_get());
if (iter->cpu == -1 && !iter->second_pass)
- timer_list_header(m, now);
+ timer_list_header(m, iter->now);
else if (!iter->second_pass)
print_cpu(m, iter->cpu, iter->now);
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
return;
}
-static void *timer_list_start(struct seq_file *file, loff_t *offset)
+static void *move_iter(struct timer_list_iter *iter, loff_t offset)
{
- struct timer_list_iter *iter = file->private;
-
- if (!*offset) {
- iter->cpu = -1;
- iter->now = ktime_to_ns(ktime_get());
- } else if (iter->cpu >= nr_cpu_ids) {
+ for (; offset; offset--) {
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ if (iter->cpu >= nr_cpu_ids) {
#ifdef CONFIG_GENERIC_CLOCKEVENTS
- if (!iter->second_pass) {
- iter->cpu = -1;
- iter->second_pass = true;
- } else
- return NULL;
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
#else
- return NULL;
+ return NULL;
#endif
+ }
}
return iter;
}
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset)
+ iter->now = ktime_to_ns(ktime_get());
+ iter->cpu = -1;
+ iter->second_pass = false;
+ return move_iter(iter, *offset);
+}
+
static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
{
struct timer_list_iter *iter = file->private;
- iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
++*offset;
- return timer_list_start(file, offset);
+ return move_iter(iter, 1);
}
static void timer_list_stop(struct seq_file *seq, void *v)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8ce9eefc5bb..a6d098c6df3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2169,12 +2169,57 @@ static cycle_t ftrace_update_time;
static unsigned long ftrace_update_cnt;
unsigned long ftrace_update_tot_cnt;
-static int ops_traces_mod(struct ftrace_ops *ops)
+static inline int ops_traces_mod(struct ftrace_ops *ops)
{
- struct ftrace_hash *hash;
+ /*
+ * Filter_hash being empty will default to trace module.
+ * But notrace hash requires a test of individual module functions.
+ */
+ return ftrace_hash_empty(ops->filter_hash) &&
+ ftrace_hash_empty(ops->notrace_hash);
+}
+
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static inline bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ /* If ops traces all mods, we already accounted for it */
+ if (ops_traces_mod(ops))
+ return 0;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->filter_hash) &&
+ !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ return 0;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ return 0;
+
+ return 1;
+}
+
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
- hash = ops->filter_hash;
- return ftrace_hash_empty(hash);
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
}
static int ftrace_update_code(struct module *mod)
@@ -2183,6 +2228,7 @@ static int ftrace_update_code(struct module *mod)
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long ref = 0;
+ bool test = false;
int i;
/*
@@ -2196,9 +2242,12 @@ static int ftrace_update_code(struct module *mod)
for (ops = ftrace_ops_list;
ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED &&
- ops_traces_mod(ops))
- ref++;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ if (ops_traces_mod(ops))
+ ref++;
+ else
+ test = true;
+ }
}
}
@@ -2208,12 +2257,16 @@ static int ftrace_update_code(struct module *mod)
for (pg = ftrace_new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
+ int cnt = ref;
+
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- p->flags = ref;
+ if (test)
+ cnt += referenced_filters(p);
+ p->flags = cnt;
/*
* Do the initial record conversion from mcount jump
@@ -2233,7 +2286,7 @@ static int ftrace_update_code(struct module *mod)
* conversion puts the module to the correct state, thus
* passing the ftrace_make_call check.
*/
- if (ftrace_start_up && ref) {
+ if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
ftrace_bug(failed, p->ip);
@@ -3384,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
+static void ftrace_ops_update_code(struct ftrace_ops *ops)
+{
+ if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+}
+
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
@@ -3426,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
- && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (!ret)
+ ftrace_ops_update_code(ops);
mutex_unlock(&ftrace_lock);
@@ -3655,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
- if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
- && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (!ret)
+ ftrace_ops_update_code(iter->ops);
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 882ec1dd151..496f94d5769 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -243,20 +243,25 @@ int filter_current_check_discard(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(filter_current_check_discard);
-cycle_t ftrace_now(int cpu)
+cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
/* Early boot up does not have a buffer yet */
- if (!global_trace.trace_buffer.buffer)
+ if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
- ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
+ ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
}
+cycle_t ftrace_now(int cpu)
+{
+ return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+}
+
/**
* tracing_is_enabled - Show if global_trace has been disabled
*
@@ -1211,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
/* Make sure all commits have finished */
synchronize_sched();
- buf->time_start = ftrace_now(buf->cpu);
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
for_each_online_cpu(cpu)
ring_buffer_reset_cpu(buffer, cpu);
@@ -1219,11 +1224,6 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
ring_buffer_record_enable(buffer);
}
-void tracing_reset_current(int cpu)
-{
- tracing_reset(&global_trace.trace_buffer, cpu);
-}
-
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus(void)
{
@@ -4151,6 +4151,7 @@ waitagain:
memset(&iter->seq, 0,
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
+ cpumask_clear(iter->started);
iter->pos = -1;
trace_event_read_lock();
@@ -4468,7 +4469,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
/* disable tracing ? */
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
- tracing_off();
+ tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
@@ -4633,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
* New clock may not be consistent with the previous clock.
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
- tracing_reset_online_cpus(&global_trace.trace_buffer);
+ tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
- tracing_reset_online_cpus(&global_trace.max_buffer);
+ tracing_reset_online_cpus(&tr->max_buffer);
#endif
mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 898f868833f..29a7ebcfb42 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -409,33 +409,42 @@ static void put_system(struct ftrace_subsystem_dir *dir)
mutex_unlock(&event_mutex);
}
-/*
- * Open and update trace_array ref count.
- * Must have the current trace_array passed to it.
- */
-static int tracing_open_generic_file(struct inode *inode, struct file *filp)
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
- int ret;
+ if (!dir)
+ return;
- if (trace_array_get(tr) < 0)
- return -ENODEV;
+ if (!--dir->nr_events) {
+ debugfs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
- ret = tracing_open_generic(inode, filp);
- if (ret < 0)
- trace_array_put(tr);
- return ret;
+static void *event_file_data(struct file *filp)
+{
+ return ACCESS_ONCE(file_inode(filp)->i_private);
}
-static int tracing_release_generic_file(struct inode *inode, struct file *filp)
+static void remove_event_file_dir(struct ftrace_event_file *file)
{
- struct ftrace_event_file *file = inode->i_private;
- struct trace_array *tr = file->tr;
+ struct dentry *dir = file->dir;
+ struct dentry *child;
- trace_array_put(tr);
+ if (dir) {
+ spin_lock(&dir->d_lock); /* probably unneeded */
+ list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ if (child->d_inode) /* probably unneeded */
+ child->d_inode->i_private = NULL;
+ }
+ spin_unlock(&dir->d_lock);
- return 0;
+ debugfs_remove_recursive(dir);
+ }
+
+ list_del(&file->list);
+ remove_subsystem(file->system);
+ kmem_cache_free(file_cachep, file);
}
/*
@@ -679,15 +688,25 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
+ unsigned long flags;
char buf[4] = "0";
- if (file->flags & FTRACE_EVENT_FL_ENABLED &&
- !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED &&
+ !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
- file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
+ flags & FTRACE_EVENT_FL_SOFT_MODE)
strcat(buf, "*");
strcat(buf, "\n");
@@ -699,13 +718,10 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
unsigned long val;
int ret;
- if (!file)
- return -EINVAL;
-
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -717,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
switch (val) {
case 0:
case 1:
+ ret = -ENODEV;
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(file, val);
+ file = event_file_data(filp);
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -825,7 +844,7 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -857,7 +876,7 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -910,6 +929,11 @@ static void *f_start(struct seq_file *m, loff_t *pos)
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ if (!event_file_data(m->private))
+ return ERR_PTR(-ENODEV);
+
while (l < *pos && p)
p = f_next(m, p, &l);
@@ -918,6 +942,7 @@ static void *f_start(struct seq_file *m, loff_t *pos)
static void f_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static const struct seq_operations trace_format_seq_ops = {
@@ -929,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = {
static int trace_format_open(struct inode *inode, struct file *file)
{
- struct ftrace_event_call *call = inode->i_private;
struct seq_file *m;
int ret;
@@ -938,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = call;
+ m->private = file;
return 0;
}
@@ -946,14 +970,18 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ int id = (long)event_file_data(filp);
char buf[32];
int len;
if (*ppos)
return 0;
- len = sprintf(buf, "%d\n", call->event.type);
+ if (unlikely(!id))
+ return -ENODEV;
+
+ len = sprintf(buf, "%d\n", id);
+
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
@@ -961,21 +989,28 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
struct trace_seq *s;
- int r;
+ int r = -ENODEV;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
+
if (!s)
return -ENOMEM;
trace_seq_init(s);
- print_event_filter(call, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (call)
+ print_event_filter(call, s);
+ mutex_unlock(&event_mutex);
+
+ if (call)
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -986,9 +1021,9 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
char *buf;
- int err;
+ int err = -ENODEV;
if (cnt >= PAGE_SIZE)
return -EINVAL;
@@ -1003,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_event_filter(call, buf);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (call)
+ err = apply_event_filter(call, buf);
+ mutex_unlock(&event_mutex);
+
free_page((unsigned long) buf);
if (err < 0)
return err;
@@ -1225,10 +1265,9 @@ static const struct file_operations ftrace_set_event_fops = {
};
static const struct file_operations ftrace_enable_fops = {
- .open = tracing_open_generic_file,
+ .open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
- .release = tracing_release_generic_file,
.llseek = default_llseek,
};
@@ -1240,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = {
};
static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
.read = event_id_read,
.llseek = default_llseek,
};
@@ -1488,8 +1526,8 @@ event_create_dir(struct dentry *parent,
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir, call,
- id);
+ trace_create_file("id", 0444, file->dir,
+ (void *)(long)call->event.type, id);
#endif
/*
@@ -1514,33 +1552,16 @@ event_create_dir(struct dentry *parent,
return 0;
}
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
-{
- if (!dir)
- return;
-
- if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
- list_del(&dir->list);
- __put_system_dir(dir);
- }
-}
-
static void remove_event_from_tracers(struct ftrace_event_call *call)
{
struct ftrace_event_file *file;
struct trace_array *tr;
do_for_each_event_file_safe(tr, file) {
-
if (file->event_call != call)
continue;
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
-
+ remove_event_file_dir(file);
/*
* The do_for_each_event_file_safe() is
* a double loop. After finding the call for this
@@ -1692,16 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
destroy_preds(call);
}
+static int probe_remove_event_call(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->perf_refcount)
+ return -EBUSY;
+#endif
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ /*
+ * We can't rely on ftrace_event_enable_disable(enable => 0)
+ * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * TRACE_REG_UNREGISTER.
+ */
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return -EBUSY;
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+
+ __trace_remove_event_call(call);
+
+ return 0;
+}
+
/* Remove an event_call */
-void trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct ftrace_event_call *call)
{
+ int ret;
+
mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
down_write(&trace_event_sem);
- __trace_remove_event_call(call);
+ ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+
+ return ret;
}
#define for_each_event(event, start, end) \
@@ -2270,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
{
struct ftrace_event_file *file, *next;
- list_for_each_entry_safe(file, next, &tr->events, list) {
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
- }
+ list_for_each_entry_safe(file, next, &tr->events, list)
+ remove_event_file_dir(file);
}
static void
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c7b75a8acc..97daa8cf958 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
+/* caller must hold event_mutex */
void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter;
+ struct event_filter *filter = call->filter;
- mutex_lock(&event_mutex);
- filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_puts(s, "none\n");
- mutex_unlock(&event_mutex);
}
void print_subsystem_event_filter(struct event_subsystem *system,
@@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
return err;
}
+/* caller must hold event_mutex */
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
struct event_filter *filter;
- int err = 0;
-
- mutex_lock(&event_mutex);
+ int err;
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(call);
filter = call->filter;
if (!filter)
- goto out_unlock;
+ return 0;
RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
- goto out_unlock;
+ return 0;
}
err = create_filter(call, filter_string, true, &filter);
@@ -1884,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
__free_filter(tmp);
}
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3811487e7a7..243f6834d02 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -95,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
}
static int register_probe_event(struct trace_probe *tp);
-static void unregister_probe_event(struct trace_probe *tp);
+static int unregister_probe_event(struct trace_probe *tp);
static DEFINE_MUTEX(probe_lock);
static LIST_HEAD(probe_list);
@@ -351,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp)
if (trace_probe_is_enabled(tp))
return -EBUSY;
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_probe_event(tp))
+ return -EBUSY;
+
__unregister_trace_probe(tp);
list_del(&tp->list);
- unregister_probe_event(tp);
return 0;
}
@@ -632,7 +635,9 @@ static int release_all_trace_probes(void)
/* TODO: Use batch unregistration */
while (!list_empty(&probe_list)) {
tp = list_entry(probe_list.next, struct trace_probe, list);
- unregister_trace_probe(tp);
+ ret = unregister_trace_probe(tp);
+ if (ret)
+ goto end;
free_trace_probe(tp);
}
@@ -1247,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp)
return ret;
}
-static void unregister_probe_event(struct trace_probe *tp)
+static int unregister_probe_event(struct trace_probe *tp)
{
+ int ret;
+
/* tp->event is unregistered in trace_remove_event_call() */
- trace_remove_event_call(&tp->call);
- kfree(tp->call.print_fmt);
+ ret = trace_remove_event_call(&tp->call);
+ if (!ret)
+ kfree(tp->call.print_fmt);
+ return ret;
}
/* Make a debugfs interface for controlling probe points */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a23d2d71188..272261b5f94 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -70,7 +70,7 @@ struct trace_uprobe {
(sizeof(struct probe_arg) * (n)))
static int register_uprobe_event(struct trace_uprobe *tu);
-static void unregister_uprobe_event(struct trace_uprobe *tu);
+static int unregister_uprobe_event(struct trace_uprobe *tu);
static DEFINE_MUTEX(uprobe_lock);
static LIST_HEAD(uprobe_list);
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
}
/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
-static void unregister_trace_uprobe(struct trace_uprobe *tu)
+static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
+ int ret;
+
+ ret = unregister_uprobe_event(tu);
+ if (ret)
+ return ret;
+
list_del(&tu->list);
- unregister_uprobe_event(tu);
free_trace_uprobe(tu);
+ return 0;
}
/* Register a trace_uprobe and probe_event */
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
/* register as an event */
old_tp = find_probe_event(tu->call.name, tu->call.class->system);
- if (old_tp)
+ if (old_tp) {
/* delete old event */
- unregister_trace_uprobe(old_tp);
+ ret = unregister_trace_uprobe(old_tp);
+ if (ret)
+ goto end;
+ }
ret = register_uprobe_event(tu);
if (ret) {
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)
group = UPROBE_EVENT_SYSTEM;
if (is_delete) {
+ int ret;
+
if (!event) {
pr_info("Delete command needs an event name.\n");
return -EINVAL;
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)
return -ENOENT;
}
/* delete an event */
- unregister_trace_uprobe(tu);
+ ret = unregister_trace_uprobe(tu);
mutex_unlock(&uprobe_lock);
- return 0;
+ return ret;
}
if (argc < 2) {
@@ -408,16 +419,20 @@ fail_address_parse:
return ret;
}
-static void cleanup_all_probes(void)
+static int cleanup_all_probes(void)
{
struct trace_uprobe *tu;
+ int ret = 0;
mutex_lock(&uprobe_lock);
while (!list_empty(&uprobe_list)) {
tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
- unregister_trace_uprobe(tu);
+ ret = unregister_trace_uprobe(tu);
+ if (ret)
+ break;
}
mutex_unlock(&uprobe_lock);
+ return ret;
}
/* Probes listing interfaces */
@@ -462,8 +477,13 @@ static const struct seq_operations probes_seq_op = {
static int probes_open(struct inode *inode, struct file *file)
{
- if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
- cleanup_all_probes();
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = cleanup_all_probes();
+ if (ret)
+ return ret;
+ }
return seq_open(file, &probes_seq_op);
}
@@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return ret;
}
-static void unregister_uprobe_event(struct trace_uprobe *tu)
+static int unregister_uprobe_event(struct trace_uprobe *tu)
{
+ int ret;
+
/* tu->event is unregistered in trace_remove_event_call() */
- trace_remove_event_call(&tu->call);
+ ret = trace_remove_event_call(&tu->call);
+ if (ret)
+ return ret;
kfree(tu->call.print_fmt);
tu->call.print_fmt = NULL;
+ return 0;
}
/* Make a trace interface for controling probe points */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d8c30db06c5..9064b919a40 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)
kgid_t group = new->egid;
int ret;
+ if (parent_ns->level > 32)
+ return -EUSERS;
+
/*
* Verify that we can not violate the policy of which files
* may be accessed that is specified by the root directory,
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
+ ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
struct cred *cred;
+ int err = -ENOMEM;
if (!(unshare_flags & CLONE_NEWUSER))
return 0;
cred = prepare_creds();
- if (!cred)
- return -ENOMEM;
+ if (cred) {
+ err = create_user_ns(cred);
+ if (err)
+ put_cred(cred);
+ else
+ *new_cred = cred;
+ }
- *new_cred = cred;
- return create_user_ns(cred);
+ return err;
}
void free_user_ns(struct user_namespace *ns)
diff --git a/kernel/wait.c b/kernel/wait.c
index dec68bd4e9d..d550920e040 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -363,8 +363,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
/**
* wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @word: The word being waited on, a kernel virtual address
- * @bit: The bit of the word being waited on
+ * @p: The atomic_t being waited on, a kernel virtual address
*
* Wake up anyone waiting for the atomic_t to go to zero.
*
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0b72e816b8d..5f8ee91abdf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2201,6 +2201,15 @@ __acquires(&pool->lock)
dump_stack();
}
+ /*
+ * The following prevents a kworker from hogging CPU on !PREEMPT
+ * kernels, where a requeueing work item waiting for something to
+ * happen could deadlock with stop_machine as such work item could
+ * indefinitely requeue itself while all other CPUs are trapped in
+ * stop_machine.
+ */
+ cond_resched();
+
spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
@@ -2817,6 +2826,19 @@ already_gone:
return false;
}
+static bool __flush_work(struct work_struct *work)
+{
+ struct wq_barrier barr;
+
+ if (start_flush_work(work, &barr)) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ return true;
+ } else {
+ return false;
+ }
+}
+
/**
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
@@ -2830,18 +2852,10 @@ already_gone:
*/
bool flush_work(struct work_struct *work)
{
- struct wq_barrier barr;
-
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
- if (start_flush_work(work, &barr)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return true;
- } else {
- return false;
- }
+ return __flush_work(work);
}
EXPORT_SYMBOL_GPL(flush_work);
@@ -3081,25 +3095,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
return wq_dev->wq;
}
-static ssize_t wq_per_cpu_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
+static DEVICE_ATTR_RO(per_cpu);
-static ssize_t wq_max_active_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}
-static ssize_t wq_max_active_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int val;
@@ -3110,12 +3125,14 @@ static ssize_t wq_max_active_store(struct device *dev,
workqueue_set_max_active(wq, val);
return count;
}
+static DEVICE_ATTR_RW(max_active);
-static struct device_attribute wq_sysfs_attrs[] = {
- __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
- __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
- __ATTR_NULL,
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(wq_sysfs);
static ssize_t wq_pool_ids_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -3265,7 +3282,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
static struct bus_type wq_subsys = {
.name = "workqueue",
- .dev_attrs = wq_sysfs_attrs,
+ .dev_groups = wq_sysfs_groups,
};
static int __init wq_sysfs_init(void)
@@ -3411,6 +3428,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ /*
+ * Unlike hash and equality test, this function doesn't ignore
+ * ->no_numa as it is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */
+ to->no_numa = from->no_numa;
}
/* hash value of the content of @attr */
@@ -3578,6 +3601,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
+ /*
+ * no_numa isn't a worker_pool attribute, always clear it. See
+ * 'struct workqueue_attrs' comments for detail.
+ */
+ pool->attrs->no_numa = false;
+
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
@@ -4756,7 +4785,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
schedule_work_on(cpu, &wfc.work);
- flush_work(&wfc.work);
+
+ /*
+ * The work item is on-stack and can't lead to deadlock through
+ * flushing. Use __flush_work() to avoid spurious lockdep warnings
+ * when work_on_cpu()s are nested.
+ */
+ __flush_work(&wfc.work);
+
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);