summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c76
-rw-r--r--kernel/audit.c30
-rw-r--r--kernel/audit_tree.c10
-rw-r--r--kernel/audit_watch.c25
-rw-r--r--kernel/cgroup.c76
-rw-r--r--kernel/cpuset.c130
-rw-r--r--kernel/debug/kdb/kdb_main.c91
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/events/core.c49
-rw-r--r--kernel/events/uprobes.c622
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/fork.c33
-rw-r--r--kernel/hrtimer.c53
-rw-r--r--kernel/irq/irqdomain.c8
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c37
-rw-r--r--kernel/kthread.c88
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/hibernate.c50
-rw-r--r--kernel/power/main.c45
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/power/swap.c82
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/power/wakelock.c7
-rw-r--r--kernel/printk.c472
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcutiny.c4
-rw-r--r--kernel/rcutiny_plugin.h56
-rw-r--r--kernel/rcutorture.c72
-rw-r--r--kernel/rcutree.c479
-rw-r--r--kernel/rcutree.h47
-rw-r--r--kernel/rcutree_plugin.h237
-rw-r--r--kernel/rcutree_trace.c148
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c37
-rw-r--r--kernel/sched/core.c370
-rw-r--r--kernel/sched/fair.c113
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/sched.h25
-rw-r--r--kernel/signal.c15
-rw-r--r--kernel/smp.c20
-rw-r--r--kernel/smpboot.h2
-rw-r--r--kernel/sys.c73
-rw-r--r--kernel/sysctl.c43
-rw-r--r--kernel/task_work.c94
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/ntp.c8
-rw-r--r--kernel/time/tick-sched.c194
-rw-r--r--kernel/time/timekeeping.c511
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timer.c110
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/trace.c46
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_functions.c36
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/watchdog.c21
-rw-r--r--kernel/workqueue.c1144
63 files changed, 3444 insertions, 2574 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bb..9d311838485 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
#define MAX_WORK 32768
static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static ASYNC_DOMAIN(async_running);
+static LIST_HEAD(async_domains);
static DEFINE_SPINLOCK(async_lock);
+static DEFINE_MUTEX(async_register_mutex);
struct async_entry {
struct list_head list;
@@ -71,7 +73,7 @@ struct async_entry {
async_cookie_t cookie;
async_func_ptr *func;
void *data;
- struct list_head *running;
+ struct async_domain *running;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
/*
* MUST be called with the lock held!
*/
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t __lowest_in_progress(struct async_domain *running)
{
struct async_entry *entry;
- if (!list_empty(running)) {
- entry = list_first_entry(running,
- struct async_entry, list);
+ if (!list_empty(&running->domain)) {
+ entry = list_first_entry(&running->domain, typeof(*entry), list);
return entry->cookie;
}
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
return next_cookie; /* "infinity" value */
}
-static async_cookie_t lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *running)
{
unsigned long flags;
async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
container_of(work, struct async_entry, work);
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
+ struct async_domain *running = entry->running;
/* 1) move self to the running queue */
spin_lock_irqsave(&async_lock, flags);
- list_move_tail(&entry->list, entry->running);
+ list_move_tail(&entry->list, &running->domain);
spin_unlock_irqrestore(&async_lock, flags);
/* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
/* 3) remove self from the running queue */
spin_lock_irqsave(&async_lock, flags);
list_del(&entry->list);
+ if (running->registered && --running->count == 0)
+ list_del_init(&running->node);
/* 4) free the entry */
kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
{
struct async_entry *entry;
unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
spin_lock_irqsave(&async_lock, flags);
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->list, &async_pending);
+ if (running->registered && running->count++ == 0)
+ list_add_tail(&running->node, &async_domains);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
* Note: This function may be called from atomic or non-atomic contexts.
*/
async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct list_head *running)
+ struct async_domain *running)
{
return __async_schedule(ptr, data, running);
}
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
+ mutex_lock(&async_register_mutex);
do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ struct async_domain *domain = NULL;
+
+ spin_lock_irq(&async_lock);
+ if (!list_empty(&async_domains))
+ domain = list_first_entry(&async_domains, typeof(*domain), node);
+ spin_unlock_irq(&async_lock);
+
+ async_synchronize_cookie_domain(next_cookie, domain);
+ } while (!list_empty(&async_domains));
+ mutex_unlock(&async_register_mutex);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ mutex_lock(&async_register_mutex);
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->node) ||
+ !list_empty(&domain->domain));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+ mutex_unlock(&async_register_mutex);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: running list to synchronize on
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by the running list @domain have been done.
*/
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, list);
+ async_synchronize_cookie_domain(next_cookie, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
* @running: running list to synchronize on
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
+ * synchronization domain specified by running list @running submitted
* prior to @cookie have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie,
- struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
{
ktime_t uninitialized_var(starttime), delta, endtime;
+ if (!running)
+ return;
+
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416..4a3f28d2ca6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
static void audit_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
- char *data = NLMSG_DATA(nlh);
+ char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
- data = NLMSG_DATA(nlh);
+ nlh = nlmsg_put(skb, pid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
-nlmsg_failure: /* Used by NLMSG_NEW */
- if (skb)
- kfree_skb(skb);
+out_kfree_skb:
+ kfree_skb(skb);
return NULL;
}
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
- data = NLMSG_DATA(nlh);
+ data = nlmsg_data(nlh);
switch (msg_type) {
case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb)
static int __init audit_init(void)
{
int i;
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ };
if (audit_initialized == AUDIT_DISABLED)
return 0;
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
- audit_receive, NULL, THIS_MODULE);
+ audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+ THIS_MODULE, &cfg);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
- goto nlmsg_failure;
+ goto err;
- nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
return ab;
-nlmsg_failure: /* Used by NLMSG_NEW */
+out_kfree_skb:
kfree_skb(ab->skb);
ab->skb = NULL;
err:
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e..3a5ca582ba1 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
root_mnt = collect_mounts(&path);
path_put(&path);
- if (!root_mnt)
+ if (IS_ERR(root_mnt))
goto skip_it;
spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
- if (!mnt) {
- err = -ENOMEM;
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto Err;
}
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
return err;
tagged = collect_mounts(&path2);
path_put(&path2);
- if (!tagged)
- return -ENOMEM;
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
err = kern_path(old, 0, &path1);
if (err) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d..3823281401b 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata nd;
- struct dentry *d;
- int err;
-
- err = kern_path_parent(watch->path, &nd);
- if (err)
- return err;
-
- if (nd.last_type != LAST_NORM) {
- path_put(&nd.path);
- return -EINVAL;
- }
-
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
- if (IS_ERR(d)) {
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
return PTR_ERR(d);
- }
+ mutex_unlock(&parent->dentry->d_inode->i_mutex);
if (d->d_inode) {
/* update watch filter fields */
watch->dev = d->d_inode->i_sb->s_dev;
watch->ino = d->d_inode->i_ino;
}
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-
- *parent = nd.path;
dput(d);
return 0;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2097684cf19..79818507e44 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
*/
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
@@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
mutex_unlock(&cgroup_mutex);
/*
- * We want to drop the active superblock reference from the
- * cgroup creation after all the dentry refs are gone -
- * kill_sb gets mighty unhappy otherwise. Mark
- * dentry->d_fsdata with cgroup_diput() to tell
- * cgroup_d_release() to call deactivate_super().
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
*/
- dentry->d_fsdata = cgroup_diput;
+ deactivate_super(cgrp->root->sb);
/*
* if we're getting rid of the cgroup, refcount should ensure
@@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)
return 1;
}
-static void cgroup_d_release(struct dentry *dentry)
-{
- /* did cgroup_diput() tell me to deactivate super? */
- if (dentry->d_fsdata == cgroup_diput)
- deactivate_super(dentry->d_sb);
-}
-
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -964,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
dget(d);
d_delete(d);
- simple_unlink(d->d_inode, d);
+ simple_unlink(cgrp->dentry->d_inode, d);
list_del_init(&cfe->node);
dput(d);
@@ -1078,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
ss->bind(cgrp);
- mutex_unlock(&ss->hierarchy_mutex);
/* refcount was already taken, and we're keeping it */
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
ss->bind(dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
subsys[i]->root = &rootnode;
list_move(&ss->sibling, &rootnode.subsys_list);
- mutex_unlock(&ss->hierarchy_mutex);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
} else if (bit & final_bits) {
@@ -1547,7 +1533,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
.d_delete = cgroup_delete,
- .d_release = cgroup_d_release,
};
struct inode *inode =
@@ -1598,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
opts.new_root = new_root;
/* Locate an existing or new sb for this hierarchy */
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
+ sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
cgroup_drop_root(opts.new_root);
@@ -2581,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -3894,8 +3879,12 @@ static void css_dput_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
+ struct dentry *dentry = css->cgroup->dentry;
+ struct super_block *sb = dentry->d_sb;
- dput(css->cgroup->dentry);
+ atomic_inc(&sb->s_active);
+ dput(dentry);
+ deactivate_super(sb);
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -3922,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
}
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
- /* We need to take each hierarchy_mutex in a consistent order */
- int i;
-
- /*
- * No worry about a race with rebind_subsystems that might mess up the
- * locking order, since both parties are under cgroup_mutex.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
- }
-}
-
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
- int i;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
- }
-}
-
/*
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
@@ -4013,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
ss->post_clone(cgrp);
}
- cgroup_lock_hierarchy(root);
list_add(&cgrp->sibling, &cgrp->parent->children);
- cgroup_unlock_hierarchy(root);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4042,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err_remove:
- cgroup_lock_hierarchy(root);
list_del(&cgrp->sibling);
- cgroup_unlock_hierarchy(root);
root->number_of_cgroups--;
err_destroy:
@@ -4252,10 +4206,8 @@ again:
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
- cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
list_del_init(&cgrp->sibling);
- cgroup_unlock_hierarchy(cgrp->root);
list_del_init(&cgrp->allcg_node);
@@ -4329,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
/* this function shouldn't be used with modular subsystems, since they
@@ -4457,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
}
write_unlock(&css_set_lock);
- mutex_init(&ss->hierarchy_mutex);
- lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
/* success! */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd1..f33c7153b6d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
+/* the type of hotplug event */
+enum hotplug_event {
+ CPUSET_CPU_OFFLINE,
+ CPUSET_MEM_OFFLINE,
+};
+
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+ struct cpuset *cp;
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct cgroup *cont;
+
+ if (list_empty(queue))
+ return NULL;
+
+ cp = list_first_entry(queue, struct cpuset, stack_list);
+ list_del(queue->next);
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, queue);
+ }
+
+ return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
*
* Called with cgroup_mutex held. We take callback_mutex to modify
* cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* before dropping down to the next. It always processes a node before
* any of its children.
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
{
LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ struct cpuset *cp; /* scans cpusets being updated */
static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
+ switch (event) {
+ case CPUSET_CPU_OFFLINE:
+ while ((cp = cpuset_next(&queue)) != NULL) {
+
+ /* Continue past cpusets with all cpus online */
+ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+ continue;
+
+ /* Remove offline cpus from this cpuset. */
+ mutex_lock(&callback_mutex);
+ cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+ cpu_active_mask);
+ mutex_unlock(&callback_mutex);
+
+ /* Move tasks from the empty cpuset to a parent */
+ if (cpumask_empty(cp->cpus_allowed))
+ remove_tasks_in_empty_cpuset(cp);
+ else
+ update_tasks_cpumask(cp, NULL);
}
+ break;
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+ case CPUSET_MEM_OFFLINE:
+ while ((cp = cpuset_next(&queue)) != NULL) {
- oldmems = cp->mems_allowed;
+ /* Continue past cpusets with all mems online */
+ if (nodes_subset(cp->mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ continue;
- /* Remove offline cpus and mems from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
+ oldmems = cp->mems_allowed;
+
+ /* Remove offline mems from this cpuset. */
+ mutex_lock(&callback_mutex);
+ nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_unlock(&callback_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
+ /* Move tasks from the empty cpuset to a parent */
+ if (nodes_empty(cp->mems_allowed))
+ remove_tasks_in_empty_cpuset(cp);
+ else
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* (of no affect) on systems that are actively using CPU hotplug
* but making no active use of cpusets.
*
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_active_mask on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
*/
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
mutex_unlock(&callback_mutex);
- scan_for_empty_cpusets(&top_cpuset);
+
+ if (!cpu_online)
+ scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
case MEM_OFFLINE:
/*
* needn't update top_cpuset.mems_allowed explicitly because
- * scan_for_empty_cpusets() will update it.
+ * scan_cpusets_upon_hotplug() will update it.
*/
- scan_for_empty_cpusets(&top_cpuset);
+ scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
break;
default:
break;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2b..1f91413edb8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
*/
static int kdb_dmesg(int argc, const char **argv)
{
- char *syslog_data[4], *start, *end, c = '\0', *p;
- int diag, logging, logsize, lines = 0, adjust = 0, n;
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
if (argc > 2)
return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
- * logical start, end+1. */
- kdb_syslog_data(syslog_data);
- if (syslog_data[2] == syslog_data[3])
- return 0;
- logsize = syslog_data[1] - syslog_data[0];
- start = syslog_data[2];
- end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
- for (n = 0, p = start; p < end; ++p) {
- c = *KDB_WRAP(p);
- if (c == '\n')
- ++n;
- }
- if (c != '\n')
- ++n;
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
if (lines < 0) {
if (adjust >= n)
kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
else if (adjust - lines >= n)
kdb_printf("buffer only contains %d lines, last %d "
"lines printed\n", n, n - adjust);
- if (adjust) {
- for (; start < end && adjust; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --adjust;
- }
- if (start < end)
- ++start;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- ++lines;
- }
- end = p;
+ skip = adjust;
+ lines = abs(lines);
} else if (lines > 0) {
- int skip = n - (adjust + lines);
+ skip = n - lines - adjust;
+ lines = abs(lines);
if (adjust >= n) {
kdb_printf("buffer only contains %d lines, "
"nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_printf("buffer only contains %d lines, first "
"%d lines printed\n", n, lines);
}
- for (; start < end && skip; ++start) {
- if (*KDB_WRAP(start) == '\n')
- --skip;
- }
- for (p = start; p < end && lines; ++p) {
- if (*KDB_WRAP(p) == '\n')
- --lines;
- }
- end = p;
+ } else {
+ lines = n;
}
- /* Do a line at a time (max 200 chars) to reduce protocol overhead */
- c = '\n';
- while (start != end) {
- char buf[201];
- p = buf;
- if (KDB_FLAG(CMD_INTERRUPT))
- return 0;
- while (start < end && (c = *KDB_WRAP(start)) &&
- (p - buf) < sizeof(buf)-1) {
- ++start;
- *p++ = c;
- if (c == '\n')
- break;
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
}
- *p = '\0';
- kdb_printf("%s", buf);
+ if (!lines--)
+ break;
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
}
- if (c != '\n')
- kdb_printf("\n");
return 0;
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513..392ec6a2584 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec97..f1cf0edeb39 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
event->ctx = ctx;
+ if (event->cpu != -1)
+ event->cpu = cpu;
if (!task) {
/*
@@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ get_online_cpus();
+
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
@@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open,
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, cpu);
+ ctx = find_get_context(pmu, task, event->cpu);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
@@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_lock(&ctx->mutex);
if (move_group) {
- perf_install_in_context(ctx, group_leader, cpu);
+ synchronize_rcu();
+ perf_install_in_context(ctx, group_leader, event->cpu);
get_ctx(ctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_install_in_context(ctx, sibling, cpu);
+ perf_install_in_context(ctx, sibling, event->cpu);
get_ctx(ctx);
}
}
- perf_install_in_context(ctx, event, cpu);
+ perf_install_in_context(ctx, event, event->cpu);
++ctx->generation;
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
+ put_online_cpus();
+
event->owner = current;
mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6426,7 @@ err_context:
err_alloc:
free_event(event);
err_task:
+ put_online_cpus();
if (task)
put_task_struct(task);
err_group_fd:
@@ -6479,6 +6487,39 @@ err:
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx;
+ struct perf_event_context *dst_ctx;
+ struct perf_event *event, *tmp;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+
+ mutex_lock(&src_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+ event_entry) {
+ perf_remove_from_context(event);
+ put_ctx(src_ctx);
+ list_add(&event->event_entry, &events);
+ }
+ mutex_unlock(&src_ctx->mutex);
+
+ synchronize_rcu();
+
+ mutex_lock(&dst_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &events, event_entry) {
+ list_del(&event->event_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+ mutex_unlock(&dst_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
+
static void sync_child_event(struct perf_event *child_event,
struct task_struct *child)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe..c08a22d02f7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,19 +32,36 @@
#include <linux/swap.h> /* try_to_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
+#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/uprobes.h>
#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
-static struct srcu_struct uprobes_srcu;
static struct rb_root uprobes_tree = RB_ROOT;
static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
#define UPROBES_HASH_SZ 13
+/*
+ * We need separate register/unregister and mmap/munmap lock hashes because
+ * of mmap_sem nesting.
+ *
+ * uprobe_register() needs to install probes on (potentially) all processes
+ * and thus needs to acquire multiple mmap_sems (consecutively, not
+ * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
+ * for the particular process doing the mmap.
+ *
+ * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
+ * because of lock order against i_mmap_mutex. This means there's a hole in
+ * the register vma iteration where a mmap() can happen.
+ *
+ * Thus uprobe_register() can race with uprobe_mmap() and we can try and
+ * install a probe where one is already installed.
+ */
+
/* serialize (un)register */
static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
@@ -61,17 +78,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
*/
static atomic_t uprobe_events = ATOMIC_INIT(0);
-/*
- * Maintain a temporary per vma info that can be used to search if a vma
- * has already been handled. This structure is introduced since extending
- * vm_area_struct wasnt recommended.
- */
-struct vma_info {
- struct list_head probe_list;
- struct mm_struct *mm;
- loff_t vaddr;
-};
-
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
@@ -100,20 +106,21 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
if (!is_register)
return true;
- if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
+ if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
+ == (VM_READ|VM_EXEC))
return true;
return false;
}
-static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
+static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
- loff_t vaddr;
-
- vaddr = vma->vm_start + offset;
- vaddr -= vma->vm_pgoff << PAGE_SHIFT;
+ return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
- return vaddr;
+static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+ return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}
/**
@@ -121,41 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
* based on replace_page in mm/ksm.c
*
* @vma: vma that holds the pte pointing to page
+ * @addr: address the old @page is mapped at
* @page: the cowed page we are replacing by kpage
* @kpage: the modified page we replace page by
*
* Returns 0 on success, -EFAULT on failure.
*/
-static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
+static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, struct page *kpage)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
spinlock_t *ptl;
- unsigned long addr;
- int err = -EFAULT;
-
- addr = page_address_in_vma(page, vma);
- if (addr == -EFAULT)
- goto out;
-
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
+ pte_t *ptep;
+ int err;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ /* For try_to_free_swap() and munlock_vma_page() below */
+ lock_page(page);
- ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ err = -EAGAIN;
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
- goto out;
+ goto unlock;
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
@@ -172,11 +165,15 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
page_remove_rmap(page);
if (!page_mapped(page))
try_to_free_swap(page);
- put_page(page);
pte_unmap_unlock(ptep, ptl);
- err = 0;
-out:
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_page(page);
+ put_page(page);
+
+ err = 0;
+ unlock:
+ unlock_page(page);
return err;
}
@@ -218,79 +215,46 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
unsigned long vaddr, uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
- struct address_space *mapping;
void *vaddr_old, *vaddr_new;
struct vm_area_struct *vma;
- struct uprobe *uprobe;
- loff_t addr;
int ret;
+retry:
/* Read the page with vaddr into memory */
ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
if (ret <= 0)
return ret;
- ret = -EINVAL;
-
- /*
- * We are interested in text pages only. Our pages of interest
- * should be mapped for read and execute only. We desist from
- * adding probes in write mapped pages since the breakpoints
- * might end up in the file copy.
- */
- if (!valid_vma(vma, is_swbp_insn(&opcode)))
- goto put_out;
-
- uprobe = container_of(auprobe, struct uprobe, arch);
- mapping = uprobe->inode->i_mapping;
- if (mapping != vma->vm_file->f_mapping)
- goto put_out;
-
- addr = vma_address(vma, uprobe->offset);
- if (vaddr != (unsigned long)addr)
- goto put_out;
-
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
- goto put_out;
+ goto put_old;
__SetPageUptodate(new_page);
- /*
- * lock page will serialize against do_wp_page()'s
- * PageAnon() handling
- */
- lock_page(old_page);
/* copy the page now that we've got it stable */
vaddr_old = kmap_atomic(old_page);
vaddr_new = kmap_atomic(new_page);
memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
-
- /* poke the new insn in, ASSUMES we don't cross page boundary */
- vaddr &= ~PAGE_MASK;
- BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
- memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
kunmap_atomic(vaddr_new);
kunmap_atomic(vaddr_old);
ret = anon_vma_prepare(vma);
if (ret)
- goto unlock_out;
+ goto put_new;
- lock_page(new_page);
- ret = __replace_page(vma, old_page, new_page);
- unlock_page(new_page);
+ ret = __replace_page(vma, vaddr, old_page, new_page);
-unlock_out:
- unlock_page(old_page);
+put_new:
page_cache_release(new_page);
-
-put_out:
+put_old:
put_page(old_page);
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
return ret;
}
@@ -312,7 +276,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
void *vaddr_new;
int ret;
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
if (ret <= 0)
return ret;
@@ -333,10 +297,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (current->mm == mm) {
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+ }
+
result = read_opcode(mm, vaddr, &opcode);
if (result)
return result;
-
+out:
if (is_swbp_insn(&opcode))
return 1;
@@ -355,7 +329,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
int result;
-
+ /*
+ * See the comment near uprobes_hash().
+ */
result = is_swbp_at_addr(mm, vaddr);
if (result == 1)
return -EEXIST;
@@ -520,7 +496,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
init_rwsem(&uprobe->consumer_rwsem);
- INIT_LIST_HEAD(&uprobe->pending_list);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +563,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
}
static int
-__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
- unsigned long nbytes, unsigned long offset)
+__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
+ unsigned long nbytes, loff_t offset)
{
- struct file *filp = vma->vm_file;
struct page *page;
void *vaddr;
- unsigned long off1;
- unsigned long idx;
+ unsigned long off;
+ pgoff_t idx;
if (!filp)
return -EINVAL;
- idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
- off1 = offset &= ~PAGE_MASK;
+ if (!mapping->a_ops->readpage)
+ return -EIO;
+
+ idx = offset >> PAGE_CACHE_SHIFT;
+ off = offset & ~PAGE_MASK;
/*
* Ensure that the page that has the original instruction is
@@ -612,22 +589,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
return PTR_ERR(page);
vaddr = kmap_atomic(page);
- memcpy(insn, vaddr + off1, nbytes);
+ memcpy(insn, vaddr + off, nbytes);
kunmap_atomic(vaddr);
page_cache_release(page);
return 0;
}
-static int
-copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
struct address_space *mapping;
unsigned long nbytes;
int bytes;
- addr &= ~PAGE_MASK;
- nbytes = PAGE_SIZE - addr;
+ nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
mapping = uprobe->inode->i_mapping;
/* Instruction at end of binary; copy only available bytes */
@@ -638,13 +613,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
/* Instruction at the page-boundary; copy bytes in second page */
if (nbytes < bytes) {
- if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
- bytes - nbytes, uprobe->offset + nbytes))
- return -ENOMEM;
-
+ int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
+ bytes - nbytes, uprobe->offset + nbytes);
+ if (err)
+ return err;
bytes = nbytes;
}
- return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
+ return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
}
/*
@@ -672,9 +647,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
*/
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, loff_t vaddr)
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- unsigned long addr;
int ret;
/*
@@ -687,20 +661,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (!uprobe->consumers)
return -EEXIST;
- addr = (unsigned long)vaddr;
-
if (!(uprobe->flags & UPROBE_COPY_INSN)) {
- ret = copy_insn(uprobe, vma, addr);
+ ret = copy_insn(uprobe, vma->vm_file);
if (ret)
return ret;
if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
- return -EEXIST;
+ return -ENOTSUPP;
- ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
if (ret)
return ret;
+ /* write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
uprobe->flags |= UPROBE_COPY_INSN;
}
@@ -713,7 +689,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
* Hence increment before and decrement on failure.
*/
atomic_inc(&mm->uprobes_state.count);
- ret = set_swbp(&uprobe->arch, mm, addr);
+ ret = set_swbp(&uprobe->arch, mm, vaddr);
if (ret)
atomic_dec(&mm->uprobes_state.count);
@@ -721,27 +697,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
}
static void
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
+ if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
atomic_dec(&mm->uprobes_state.count);
}
/*
- * There could be threads that have hit the breakpoint and are entering the
- * notifier code and trying to acquire the uprobes_treelock. The thread
- * calling delete_uprobe() that is removing the uprobe from the rb_tree can
- * race with these threads and might acquire the uprobes_treelock compared
- * to some of the breakpoint hit threads. In such a case, the breakpoint
- * hit threads will not find the uprobe. The current unregistering thread
- * waits till all other threads have hit a breakpoint, to acquire the
- * uprobes_treelock before the uprobe is removed from the rbtree.
+ * There could be threads that have already hit the breakpoint. They
+ * will recheck the current insn and restart if find_uprobe() fails.
+ * See find_active_uprobe().
*/
static void delete_uprobe(struct uprobe *uprobe)
{
unsigned long flags;
- synchronize_srcu(&uprobes_srcu);
spin_lock_irqsave(&uprobes_treelock, flags);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +720,136 @@ static void delete_uprobe(struct uprobe *uprobe)
atomic_dec(&uprobe_events);
}
-static struct vma_info *
-__find_next_vma_info(struct address_space *mapping, struct list_head *head,
- struct vma_info *vi, loff_t offset, bool is_register)
+struct map_info {
+ struct map_info *next;
+ struct mm_struct *mm;
+ unsigned long vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+ struct map_info *next = info->next;
+ kfree(info);
+ return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
+ unsigned long pgoff = offset >> PAGE_SHIFT;
struct prio_tree_iter iter;
struct vm_area_struct *vma;
- struct vma_info *tmpvi;
- unsigned long pgoff;
- int existing_vma;
- loff_t vaddr;
-
- pgoff = offset >> PAGE_SHIFT;
+ struct map_info *curr = NULL;
+ struct map_info *prev = NULL;
+ struct map_info *info;
+ int more = 0;
+ again:
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
- existing_vma = 0;
- vaddr = vma_address(vma, offset);
-
- list_for_each_entry(tmpvi, head, probe_list) {
- if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
- existing_vma = 1;
- break;
- }
+ if (!prev && !more) {
+ /*
+ * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+ * reclaim. This is optimistic, no harm done if it fails.
+ */
+ prev = kmalloc(sizeof(struct map_info),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (prev)
+ prev->next = NULL;
}
-
- /*
- * Another vma needs a probe to be installed. However skip
- * installing the probe if the vma is about to be unlinked.
- */
- if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
- vi->mm = vma->vm_mm;
- vi->vaddr = vaddr;
- list_add(&vi->probe_list, head);
-
- return vi;
+ if (!prev) {
+ more++;
+ continue;
}
- }
-
- return NULL;
-}
-/*
- * Iterate in the rmap prio tree and find a vma where a probe has not
- * yet been inserted.
- */
-static struct vma_info *
-find_next_vma_info(struct address_space *mapping, struct list_head *head,
- loff_t offset, bool is_register)
-{
- struct vma_info *vi, *retvi;
+ if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ continue;
- vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
- if (!vi)
- return ERR_PTR(-ENOMEM);
+ info = prev;
+ prev = prev->next;
+ info->next = curr;
+ curr = info;
- mutex_lock(&mapping->i_mmap_mutex);
- retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+ info->mm = vma->vm_mm;
+ info->vaddr = offset_to_vaddr(vma, offset);
+ }
mutex_unlock(&mapping->i_mmap_mutex);
- if (!retvi)
- kfree(vi);
+ if (!more)
+ goto out;
+
+ prev = curr;
+ while (curr) {
+ mmput(curr->mm);
+ curr = curr->next;
+ }
- return retvi;
+ do {
+ info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+ if (!info) {
+ curr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ info->next = prev;
+ prev = info;
+ } while (--more);
+
+ goto again;
+ out:
+ while (prev)
+ prev = free_map_info(prev);
+ return curr;
}
static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
{
- struct list_head try_list;
- struct vm_area_struct *vma;
- struct address_space *mapping;
- struct vma_info *vi, *tmpvi;
- struct mm_struct *mm;
- loff_t vaddr;
- int ret;
-
- mapping = uprobe->inode->i_mapping;
- INIT_LIST_HEAD(&try_list);
+ struct map_info *info;
+ int err = 0;
- ret = 0;
+ info = build_map_info(uprobe->inode->i_mapping,
+ uprobe->offset, is_register);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
- for (;;) {
- vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
- if (!vi)
- break;
+ while (info) {
+ struct mm_struct *mm = info->mm;
+ struct vm_area_struct *vma;
- if (IS_ERR(vi)) {
- ret = PTR_ERR(vi);
- break;
- }
+ if (err)
+ goto free;
- mm = vi->mm;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, (unsigned long)vi->vaddr);
- if (!vma || !valid_vma(vma, is_register)) {
- list_del(&vi->probe_list);
- kfree(vi);
- up_read(&mm->mmap_sem);
- mmput(mm);
- continue;
- }
- vaddr = vma_address(vma, uprobe->offset);
- if (vma->vm_file->f_mapping->host != uprobe->inode ||
- vaddr != vi->vaddr) {
- list_del(&vi->probe_list);
- kfree(vi);
- up_read(&mm->mmap_sem);
- mmput(mm);
- continue;
- }
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, info->vaddr);
+ if (!vma || !valid_vma(vma, is_register) ||
+ vma->vm_file->f_mapping->host != uprobe->inode)
+ goto unlock;
- if (is_register)
- ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
- else
- remove_breakpoint(uprobe, mm, vi->vaddr);
+ if (vma->vm_start > info->vaddr ||
+ vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+ goto unlock;
- up_read(&mm->mmap_sem);
- mmput(mm);
if (is_register) {
- if (ret && ret == -EEXIST)
- ret = 0;
- if (ret)
- break;
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ /*
+ * We can race against uprobe_mmap(), see the
+ * comment near uprobes_hash().
+ */
+ if (err == -EEXIST)
+ err = 0;
+ } else {
+ remove_breakpoint(uprobe, mm, info->vaddr);
}
+ unlock:
+ up_write(&mm->mmap_sem);
+ free:
+ mmput(mm);
+ info = free_map_info(info);
}
- list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
- list_del(&vi->probe_list);
- kfree(vi);
- }
-
- return ret;
+ return err;
}
static int __uprobe_register(struct uprobe *uprobe)
@@ -977,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
put_uprobe(uprobe);
}
-/*
- * Of all the nodes that correspond to the given inode, return the node
- * with the least offset.
- */
-static struct rb_node *find_least_offset_node(struct inode *inode)
+static struct rb_node *
+find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
- struct uprobe u = { .inode = inode, .offset = 0};
struct rb_node *n = uprobes_tree.rb_node;
- struct rb_node *close_node = NULL;
- struct uprobe *uprobe;
- int match;
while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
+ struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
- if (uprobe->inode == inode)
- close_node = n;
-
- if (!match)
- return close_node;
-
- if (match < 0)
+ if (inode < u->inode) {
n = n->rb_left;
- else
+ } else if (inode > u->inode) {
n = n->rb_right;
+ } else {
+ if (max < u->offset)
+ n = n->rb_left;
+ else if (min > u->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
}
- return close_node;
+ return n;
}
/*
- * For a given inode, build a list of probes that need to be inserted.
+ * For a given range in vma, build a list of probes that need to be inserted.
*/
-static void build_probe_list(struct inode *inode, struct list_head *head)
+static void build_probe_list(struct inode *inode,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *head)
{
- struct uprobe *uprobe;
+ loff_t min, max;
unsigned long flags;
- struct rb_node *n;
-
- spin_lock_irqsave(&uprobes_treelock, flags);
-
- n = find_least_offset_node(inode);
+ struct rb_node *n, *t;
+ struct uprobe *u;
- for (; n; n = rb_next(n)) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- if (uprobe->inode != inode)
- break;
+ INIT_LIST_HEAD(head);
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
- list_add(&uprobe->pending_list, head);
- atomic_inc(&uprobe->ref);
+ spin_lock_irqsave(&uprobes_treelock, flags);
+ n = find_node_in_range(inode, min, max);
+ if (n) {
+ for (t = n; t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ for (t = n; (t = rb_next(t)); ) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset > max)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
}
-
spin_unlock_irqrestore(&uprobes_treelock, flags);
}
@@ -1059,28 +1033,21 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (!inode)
return 0;
- INIT_LIST_HEAD(&tmp_list);
mutex_lock(uprobes_mmap_hash(inode));
- build_probe_list(inode, &tmp_list);
+ build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
ret = 0;
count = 0;
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- loff_t vaddr;
-
- list_del(&uprobe->pending_list);
if (!ret) {
- vaddr = vma_address(vma, uprobe->offset);
-
- if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
- put_uprobe(uprobe);
- continue;
- }
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
-
- /* Ignore double add: */
+ /*
+ * We can race against uprobe_register(), see the
+ * comment near uprobes_hash().
+ */
if (ret == -EEXIST) {
ret = 0;
@@ -1121,6 +1088,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
return;
+ if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
+ return;
+
if (!atomic_read(&vma->vm_mm->uprobes_state.count))
return;
@@ -1128,24 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!inode)
return;
- INIT_LIST_HEAD(&tmp_list);
mutex_lock(uprobes_mmap_hash(inode));
- build_probe_list(inode, &tmp_list);
+ build_probe_list(inode, vma, start, end, &tmp_list);
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- loff_t vaddr;
-
- list_del(&uprobe->pending_list);
- vaddr = vma_address(vma, uprobe->offset);
-
- if (vaddr >= start && vaddr < end) {
- /*
- * An unregister could have removed the probe before
- * unmap. So check before we decrement the count.
- */
- if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
- atomic_dec(&vma->vm_mm->uprobes_state.count);
- }
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
+ /*
+ * An unregister could have removed the probe before
+ * unmap. So check before we decrement the count.
+ */
+ if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
+ atomic_dec(&vma->vm_mm->uprobes_state.count);
put_uprobe(uprobe);
}
mutex_unlock(uprobes_mmap_hash(inode));
@@ -1378,9 +1341,6 @@ void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- if (t->uprobe_srcu_id != -1)
- srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
-
if (!utask)
return;
@@ -1398,7 +1358,6 @@ void uprobe_free_utask(struct task_struct *t)
void uprobe_copy_process(struct task_struct *t)
{
t->utask = NULL;
- t->uprobe_srcu_id = -1;
}
/*
@@ -1417,7 +1376,6 @@ static struct uprobe_task *add_utask(void)
if (unlikely(!utask))
return NULL;
- utask->active_uprobe = NULL;
current->utask = utask;
return utask;
}
@@ -1479,41 +1437,61 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
return false;
}
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+ if (vma && vma->vm_start <= bp_vaddr) {
+ if (valid_vma(vma, false)) {
+ struct inode *inode = vma->vm_file->f_mapping->host;
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ if (!uprobe)
+ *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+ } else {
+ *is_swbp = -EFAULT;
+ }
+ up_read(&mm->mmap_sem);
+
+ return uprobe;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
- struct vm_area_struct *vma;
struct uprobe_task *utask;
struct uprobe *uprobe;
- struct mm_struct *mm;
unsigned long bp_vaddr;
+ int uninitialized_var(is_swbp);
- uprobe = NULL;
bp_vaddr = uprobe_get_swbp_addr(regs);
- mm = current->mm;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, bp_vaddr);
-
- if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
- struct inode *inode;
- loff_t offset;
-
- inode = vma->vm_file->f_mapping->host;
- offset = bp_vaddr - vma->vm_start;
- offset += (vma->vm_pgoff << PAGE_SHIFT);
- uprobe = find_uprobe(inode, offset);
- }
-
- srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
- current->uprobe_srcu_id = -1;
- up_read(&mm->mmap_sem);
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
- /* No matching uprobe; signal SIGTRAP. */
- send_sig(SIGTRAP, current, 0);
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
return;
}
@@ -1620,7 +1598,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
utask->state = UTASK_BP_HIT;
set_thread_flag(TIF_UPROBE);
- current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
return 1;
}
@@ -1655,7 +1632,6 @@ static int __init init_uprobes(void)
mutex_init(&uprobes_mutex[i]);
mutex_init(&uprobes_mmap_mutex[i]);
}
- init_srcu_struct(&uprobes_srcu);
return register_die_notifier(&uprobe_exception_nb);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc33451..f65345f9e5b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
rcu_read_unlock();
for (;;) {
unsigned long set;
- i = j * __NFDBITS;
+ i = j * BITS_PER_LONG;
if (i >= fdt->max_fds)
break;
set = fdt->open_fds[j++];
@@ -953,14 +953,11 @@ void do_exit(long code)
exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes, and in
- * task_work_add() to avoid the race with exit_task_work().
+ * an exiting task cleaning up the robust pi futexes.
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
- exit_task_work(tsk);
-
if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
+ exit_task_work(tsk);
check_stack_usage();
exit_thread();
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e62..8efac1fe56b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
return total;
}
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
-void __weak arch_release_task_struct(struct task_struct *tsk) { }
-
static inline void free_task_struct(struct task_struct *tsk)
{
- arch_release_task_struct(tsk);
kmem_cache_free(task_struct_cachep, tsk);
}
#endif
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
+
#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
-void __weak arch_release_thread_info(struct thread_info *ti) { }
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
- arch_release_thread_info(ti);
free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static void free_thread_info(struct thread_info *ti)
{
- arch_release_thread_info(ti);
kmem_cache_free(thread_info_cache, ti);
}
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
account_kernel_stack(tsk->stack, -1);
+ arch_release_thread_info(tsk->stack);
free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
+ arch_release_task_struct(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -298,14 +302,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
return NULL;
ti = alloc_thread_info_node(tsk, node);
- if (!ti) {
- free_task_struct(tsk);
- return NULL;
- }
+ if (!ti)
+ goto free_tsk;
err = arch_dup_task_struct(tsk, orig);
if (err)
- goto out;
+ goto free_ti;
tsk->stack = ti;
@@ -333,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
return tsk;
-out:
+free_ti:
free_thread_info(ti);
+free_tsk:
free_task_struct(tsk);
return NULL;
}
@@ -386,8 +389,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
}
charge = 0;
if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len;
- len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ unsigned long len = vma_pages(mpnt);
+
if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
goto fail_nomem;
charge = len;
@@ -1415,7 +1418,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_HLIST_HEAD(&p->task_works);
+ p->task_works = NULL;
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682..6db7a5ed52b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+
+ return ktime_get_update_offsets(offs_real, offs_boot);
+}
+
/*
* Retrigger next event is called after clock was set
*
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
- struct timespec realtime_offset, xtim, wtm, sleep;
if (!hrtimer_hres_active())
return;
- /* Optimized out for !HIGH_RES */
- get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- /* Adjust CLOCK_REALTIME offset */
raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
+ hrtimer_update_base(base);
hrtimer_force_reprogram(base, 0);
raw_spin_unlock(&base->lock);
}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
-
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
return 1;
}
+/*
+ * Called from timekeeping code to reprogram the hrtimer interrupt
+ * device. If called from the timer interrupt context we defer it to
+ * softirq context.
+ */
+void clock_was_set_delayed(void)
+{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ cpu_base->clock_was_set = 1;
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
+
#else
static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
cpu_base->nr_events++;
dev->next_event.tv64 = KTIME_MAX;
- entry_time = now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
retry:
expires_next.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
* We need to prevent that we loop forever in the hrtimer
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
+ *
+ * Acquire base lock for updating the offsets and retrieving
+ * the current time.
*/
- now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
goto retry;
@@ -1343,6 +1356,7 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
+ raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
if (delta.tv64 > cpu_base->max_hang_time.tv64)
cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
static void run_hrtimer_softirq(struct softirq_action *h)
{
+ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+
+ if (cpu_base->clock_was_set) {
+ cpu_base->clock_was_set = 0;
+ clock_was_set();
+ }
+
hrtimer_peek_ahead_timers();
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 41c1564103f..38c5eb839c9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -448,7 +448,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
+ hwirq, of_node_full_name(domain->of_node), virq);
return virq;
}
@@ -477,7 +477,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
return intspec[0];
#endif
pr_warning("no irq domain found for %s !\n",
- controller->full_name);
+ of_node_full_name(controller));
return 0;
}
@@ -725,8 +725,8 @@ static int virq_debug_show(struct seq_file *m, void *private)
data = irq_desc_get_chip_data(desc);
seq_printf(m, data ? "0x%p " : " %p ", data);
- if (desc->irq_data.domain && desc->irq_data.domain->of_node)
- p = desc->irq_data.domain->of_node->full_name;
+ if (desc->irq_data.domain)
+ p = of_node_full_name(desc->irq_data.domain->of_node);
else
p = none;
seq_printf(m, "%s\n", p);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5e42eb11967..0a8e8f05962 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
wake_up(&desc->wait_for_threads);
}
-static void irq_thread_dtor(struct task_work *unused)
+static void irq_thread_dtor(struct callback_head *unused)
{
struct task_struct *tsk = current;
struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
*/
static int irq_thread(void *data)
{
- struct task_work on_exit_work;
+ struct callback_head on_exit_work;
static const struct sched_param param = {
.sched_priority = MAX_USER_RT_PRIO/2,
};
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
sched_setscheduler(current, SCHED_FIFO, &param);
- init_task_work(&on_exit_work, irq_thread_dtor, NULL);
+ init_task_work(&on_exit_work, irq_thread_dtor);
task_work_add(current, &on_exit_work, false);
while (!irq_wait_for_interrupt(action)) {
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6ae..0668d58d641 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
void crash_save_vmcoreinfo(void)
{
- vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
update_vmcoreinfo_note();
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d7..6f99aead66c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+/*
+ * kmod_thread_locker is used for deadlock avoidance. There is no explicit
+ * locking to protect this global - it is private to the singleton khelper
+ * thread and should only ever be modified by that thread.
+ */
+static const struct task_struct *kmod_thread_locker;
+
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -221,6 +228,13 @@ fail:
return 0;
}
+static int call_helper(void *data)
+{
+ /* Worker thread started blocking khelper thread. */
+ kmod_thread_locker = current;
+ return ____call_usermodehelper(data);
+}
+
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
if (wait == UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else
- pid = kernel_thread(____call_usermodehelper, sub_info,
+ else {
+ pid = kernel_thread(call_helper, sub_info,
CLONE_VFORK | SIGCHLD);
+ /* Worker thread stopped blocking khelper thread. */
+ kmod_thread_locker = NULL;
+ }
switch (wait) {
case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
retval = -EBUSY;
goto out;
}
+ /*
+ * Worker thread must not wait for khelper thread at below
+ * wait_for_completion() if the thread was created with CLONE_VFORK
+ * flag, for khelper thread is already waiting for the thread at
+ * wait_for_completion() in do_fork().
+ */
+ if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
+ retval = -EBUSY;
+ goto out;
+ }
sub_info->complete = &done;
sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
return retval;
}
+/*
+ * call_usermodehelper_fns() will not run the caller-provided cleanup function
+ * if a memory allocation failure is experienced. So the caller might need to
+ * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
+ * the necessary cleanup within the caller.
+ */
int call_usermodehelper_fns(
char *path, char **argv, char **envp, int wait,
int (*init)(struct subprocess_info *info, struct cred *new),
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702..b579af57ea1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
struct kthread_work, node);
list_del_init(&work->node);
}
+ worker->current_work = work;
spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
work->func(work);
- smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
- work->done_seq = work->queue_seq;
- smp_mb(); /* mb worker-b1 paired with flush-b0 */
- if (atomic_read(&work->flushing))
- wake_up_all(&work->done);
} else if (!freezing(current))
schedule();
@@ -378,6 +374,19 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/* insert @work before @pos in @worker */
+static void insert_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work,
+ struct list_head *pos)
+{
+ lockdep_assert_held(&worker->lock);
+
+ list_add_tail(&work->node, pos);
+ work->worker = worker;
+ if (likely(worker->task))
+ wake_up_process(worker->task);
+}
+
/**
* queue_kthread_work - queue a kthread_work
* @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
spin_lock_irqsave(&worker->lock, flags);
if (list_empty(&work->node)) {
- list_add_tail(&work->node, &worker->work_list);
- work->queue_seq++;
- if (likely(worker->task))
- wake_up_process(worker->task);
+ insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
}
EXPORT_SYMBOL_GPL(queue_kthread_work);
+struct kthread_flush_work {
+ struct kthread_work work;
+ struct completion done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+ struct kthread_flush_work *fwork =
+ container_of(work, struct kthread_flush_work, work);
+ complete(&fwork->done);
+}
+
/**
* flush_kthread_work - flush a kthread_work
* @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
*/
void flush_kthread_work(struct kthread_work *work)
{
- int seq = work->queue_seq;
-
- atomic_inc(&work->flushing);
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+ struct kthread_worker *worker;
+ bool noop = false;
- /*
- * mb flush-b0 paired with worker-b1, to make sure either
- * worker sees the above increment or we see done_seq update.
- */
- smp_mb__after_atomic_inc();
+retry:
+ worker = work->worker;
+ if (!worker)
+ return;
- /* A - B <= 0 tests whether B is in front of A regardless of overflow */
- wait_event(work->done, seq - work->done_seq <= 0);
- atomic_dec(&work->flushing);
+ spin_lock_irq(&worker->lock);
+ if (work->worker != worker) {
+ spin_unlock_irq(&worker->lock);
+ goto retry;
+ }
- /*
- * rmb flush-b1 paired with worker-b0, to make sure our caller
- * sees every change made by work->func().
- */
- smp_mb__after_atomic_dec();
-}
-EXPORT_SYMBOL_GPL(flush_kthread_work);
+ if (!list_empty(&work->node))
+ insert_kthread_work(worker, &fwork.work, work->node.next);
+ else if (worker->current_work == work)
+ insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+ else
+ noop = true;
-struct kthread_flush_work {
- struct kthread_work work;
- struct completion done;
-};
+ spin_unlock_irq(&worker->lock);
-static void kthread_flush_work_fn(struct kthread_work *work)
-{
- struct kthread_flush_work *fwork =
- container_of(work, struct kthread_flush_work, work);
- complete(&fwork->done);
+ if (!noop)
+ wait_for_completion(&fwork.done);
}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
/**
* flush_kthread_worker - flush all current works on a kthread_worker
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6d..e1b2822fff9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
int state = 0;
/*
+ * Disable local interrupts. This will prevent panic_smp_self_stop
+ * from deadlocking the first cpu that invokes the panic, since
+ * there is nothing to prevent an interrupt handler (that runs
+ * after the panic_lock is acquired) from invoking panic again.
+ */
+ local_irq_disable();
+
+ /*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e..a70518c9d82 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
You probably want to have your system's RTC driver statically
linked, ensuring that it's available when this test runs.
-config CAN_PM_TRACE
+config PM_SLEEP_DEBUG
def_bool y
depends on PM_DEBUG && PM_SLEEP
@@ -196,7 +196,7 @@ config PM_TRACE
config PM_TRACE_RTC
bool "Suspend/resume event tracing"
- depends on CAN_PM_TRACE
+ depends on PM_SLEEP_DEBUG
depends on X86
select PM_TRACE
---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a27..b26f5f1e773 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*/
@@ -27,7 +28,6 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
-#include <scsi/scsi_scan.h>
#include "power.h"
@@ -46,6 +46,9 @@ enum {
HIBERNATION_PLATFORM,
HIBERNATION_SHUTDOWN,
HIBERNATION_REBOOT,
+#ifdef CONFIG_SUSPEND
+ HIBERNATION_SUSPEND,
+#endif
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
}
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
dpm_complete(msg);
@@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -514,6 +521,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -557,6 +565,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ ftrace_start();
resume_console();
Close:
@@ -574,6 +583,10 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
+#ifdef CONFIG_SUSPEND
+ int error;
+#endif
+
switch (hibernation_mode) {
case HIBERNATION_REBOOT:
kernel_restart(NULL);
@@ -583,6 +596,25 @@ static void power_down(void)
case HIBERNATION_SHUTDOWN:
kernel_power_off();
break;
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ if (error) {
+ if (hibernation_ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+ power_down();
+ }
+ /*
+ * Restore swap signature.
+ */
+ error = swsusp_unmark();
+ if (error)
+ printk(KERN_ERR "PM: Swap will be unusable! "
+ "Try swapon -a.\n");
+ return;
+#endif
}
kernel_halt();
/*
@@ -748,13 +780,6 @@ static int software_resume(void)
async_synchronize_full();
}
- /*
- * We can't depend on SCSI devices being available after loading
- * one of their modules until scsi_complete_async_scans() is
- * called and the resume device usually is a SCSI one.
- */
- scsi_complete_async_scans();
-
swsusp_resume_device = name_to_dev_t(resume_file);
if (!swsusp_resume_device) {
error = -ENODEV;
@@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = {
[HIBERNATION_PLATFORM] = "platform",
[HIBERNATION_SHUTDOWN] = "shutdown",
[HIBERNATION_REBOOT] = "reboot",
+#ifdef CONFIG_SUSPEND
+ [HIBERNATION_SUSPEND] = "suspend",
+#endif
};
/*
@@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
switch (i) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
switch (mode) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e9..f458238109c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
#endif /* CONFIG_PM_SLEEP */
+#ifdef CONFIG_PM_SLEEP_DEBUG
+/*
+ * pm_print_times: print time taken by devices to suspend and resume.
+ *
+ * show() returns whether printing of suspend and resume times is enabled.
+ * store() accepts 0 or 1. 0 disables printing and 1 enables it.
+ */
+bool pm_print_times_enabled;
+
+static ssize_t pm_print_times_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_print_times_enabled);
+}
+
+static ssize_t pm_print_times_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_print_times_enabled = !!val;
+ return n;
+}
+
+power_attr(pm_print_times);
+
+static inline void pm_print_times_init(void)
+{
+ pm_print_times_enabled = !!initcall_debug;
+}
+#else /* !CONFIG_PM_SLEEP_DEBUG */
+static inline void pm_print_times_init(void) {}
+#endif /* CONFIG_PM_SLEEP_DEBUG */
+
struct kobject *power_kobj;
/**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
+#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_print_times_attr.attr,
+#endif
#endif
NULL,
};
@@ -566,6 +610,7 @@ static int __init pm_init(void)
error = sysfs_create_group(power_kobj, &attr_group);
if (error)
return error;
+ pm_print_times_init();
return pm_autosleep_init();
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebf..7d4b7ffb3c1 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
extern void swsusp_close(fmode_t);
+#ifdef CONFIG_SUSPEND
+extern int swsusp_unmark(void);
+#endif
/* kernel/power/block_io.c */
extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd..1da39ea248f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
#include <trace/events/power.h>
#include "power.h"
@@ -177,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
+ /* Kick the lockup detector */
+ lockup_detector_bootcpu_resume();
+
Enable_cpus:
enable_nonboot_cpus();
@@ -212,6 +216,7 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
+ ftrace_stop();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -231,6 +236,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ ftrace_start();
resume_console();
Close:
if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8..3c9d764eb0d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
struct timeval start;
struct timeval stop;
- printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
+ printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
- m = nr_to_write / 100;
+ m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_CONT "\b\b\b\bdone\n");
- else
- printk(KERN_CONT "\n");
+ printk(KERN_INFO "PM: Image saving done.\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
return ret;
}
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
printk(KERN_INFO
"PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages) ... ",
+ "PM: Compressing and saving image data (%u pages)...\n",
nr_threads, nr_to_write);
- m = nr_to_write / 100;
+ m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_CONT "\b\b\b\b%3d%%",
- nr_pages / m);
+ printk(KERN_INFO
+ "PM: Image saving progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -761,11 +762,8 @@ out_finish:
do_gettimeofday(&stop);
if (!ret)
ret = err2;
- if (!ret) {
- printk(KERN_CONT "\b\b\b\bdone\n");
- } else {
- printk(KERN_CONT "\n");
- }
+ if (!ret)
+ printk(KERN_INFO "PM: Image saving done.\n");
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
int err2;
unsigned nr_pages;
- printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
+ printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
- m = nr_to_read / 100;
+ m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
if (ret)
break;
if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
if (!ret)
ret = err2;
if (!ret) {
- printk("\b\b\b\bdone\n");
+ printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
- } else
- printk("\n");
+ }
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
return ret;
}
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
printk(KERN_INFO
"PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages) ... ",
+ "PM: Loading and decompressing image data (%u pages)...\n",
nr_threads, nr_to_read);
- m = nr_to_read / 100;
+ m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
+ printk(KERN_INFO
+ "PM: Image loading progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
}
do_gettimeofday(&stop);
if (!ret) {
- printk("\b\b\b\bdone\n");
+ printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
}
}
}
- } else
- printk("\n");
+ }
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
blkdev_put(hib_resume_bdev, mode);
}
+/**
+ * swsusp_unmark - Unmark swsusp signature in the resume device
+ */
+
+#ifdef CONFIG_SUSPEND
+int swsusp_unmark(void)
+{
+ int error;
+
+ hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
+ error = hib_bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ error = -ENODEV;
+ }
+
+ /*
+ * We just returned from suspend, we don't need the image any more.
+ */
+ free_all_swap_pages(root_swap);
+
+ return error;
+}
+#endif
+
static int swsusp_header_init(void)
{
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a9..4ed81e74f86 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <scsi/scsi_scan.h>
#include <asm/uaccess.h>
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
* appear.
*/
wait_for_device_probe();
- scsi_complete_async_scans();
data->swap = -1;
data->mode = O_WRONLY;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba338007..8f50de394d2 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
* manipulate wakelocks on Android.
*/
+#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
size_t len;
int ret = 0;
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
while (*str && !isspace(*str))
str++;
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
size_t len;
int ret = 0;
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
len = strlen(buf);
if (!len)
return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index dba18211685..6a76ab9d447 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -194,8 +194,10 @@ static int console_may_schedule;
*/
enum log_flags {
- LOG_DEFAULT = 0,
- LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_PREFIX = 4, /* text started with a prefix */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
};
struct log {
@@ -214,9 +216,12 @@ struct log {
*/
static DEFINE_RAW_SPINLOCK(logbuf_lock);
+#ifdef CONFIG_PRINTK
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
/* index and sequence number of the first record stored in the buffer */
static u64 log_first_seq;
@@ -224,14 +229,19 @@ static u32 log_first_idx;
/* index and sequence number of the next record to store in the buffer */
static u64 log_next_seq;
-#ifdef CONFIG_PRINTK
static u32 log_next_idx;
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
+
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
static u32 clear_idx;
-#define LOG_LINE_MAX 1024
+#define PREFIX_MAX 32
+#define LOG_LINE_MAX 1024 - PREFIX_MAX
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -356,6 +366,7 @@ static void log_store(int facility, int level,
struct devkmsg_user {
u64 seq;
u32 idx;
+ enum log_flags prev;
struct mutex lock;
char buf[8192];
};
@@ -378,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
line = buf;
for (i = 0; i < count; i++) {
- if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
+ if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
+ ret = -EFAULT;
goto out;
+ }
line += iv[i].iov_len;
}
@@ -421,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
struct log *msg;
u64 ts_usec;
size_t i;
+ char cont = '-';
size_t len;
ssize_t ret;
@@ -430,20 +444,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- raw_spin_lock(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
while (user->seq == log_next_seq) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
goto out;
}
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
ret = wait_event_interruptible(log_wait,
user->seq != log_next_seq);
if (ret)
goto out;
- raw_spin_lock(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
}
if (user->seq < log_first_seq) {
@@ -451,21 +465,38 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_first_idx;
user->seq = log_first_seq;
ret = -EPIPE;
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
goto out;
}
msg = log_from_idx(user->idx);
ts_usec = msg->ts_nsec;
do_div(ts_usec, 1000);
- len = sprintf(user->buf, "%u,%llu,%llu;",
- (msg->facility << 3) | msg->level, user->seq, ts_usec);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always necessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags marks the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level,
+ user->seq, ts_usec, cont);
+ user->prev = msg->flags;
/* escape non-printable characters */
for (i = 0; i < msg->text_len; i++) {
unsigned char c = log_text(msg)[i];
- if (c < ' ' || c >= 128)
+ if (c < ' ' || c >= 127 || c == '\\')
len += sprintf(user->buf + len, "\\x%02x", c);
else
user->buf[len++] = c;
@@ -489,7 +520,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
continue;
}
- if (c < ' ' || c >= 128) {
+ if (c < ' ' || c >= 127 || c == '\\') {
len += sprintf(user->buf + len, "\\x%02x", c);
continue;
}
@@ -501,7 +532,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_next(user->idx);
user->seq++;
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
if (len > count) {
ret = -EINVAL;
@@ -528,7 +559,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- raw_spin_lock(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -552,7 +583,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
return ret;
}
@@ -566,14 +597,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- raw_spin_lock(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
if (user->seq < log_next_seq) {
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
ret = POLLIN|POLLRDNORM;
}
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
return ret;
}
@@ -597,10 +628,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- raw_spin_lock(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
user->idx = log_first_idx;
user->seq = log_first_seq;
- raw_spin_unlock(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
file->private_data = user;
return 0;
@@ -642,6 +673,15 @@ void log_buf_kexec_setup(void)
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(log_first_idx);
VMCOREINFO_SYMBOL(log_next_idx);
+ /*
+ * Export struct log size and field offsets. User space tools can
+ * parse it and detect any changes to structure down the line.
+ */
+ VMCOREINFO_STRUCT_SIZE(log);
+ VMCOREINFO_OFFSET(log, ts_nsec);
+ VMCOREINFO_OFFSET(log, len);
+ VMCOREINFO_OFFSET(log, text_len);
+ VMCOREINFO_OFFSET(log, dict_len);
}
#endif
@@ -818,15 +858,18 @@ static size_t print_time(u64 ts, char *buf)
static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
{
size_t len = 0;
+ unsigned int prefix = (msg->facility << 3) | msg->level;
if (syslog) {
if (buf) {
- len += sprintf(buf, "<%u>", msg->level);
+ len += sprintf(buf, "<%u>", prefix);
} else {
len += 3;
- if (msg->level > 9)
- len++;
- if (msg->level > 99)
+ if (prefix > 999)
+ len += 3;
+ else if (prefix > 99)
+ len += 2;
+ else if (prefix > 9)
len++;
}
}
@@ -835,13 +878,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
return len;
}
-static size_t msg_print_text(const struct log *msg, bool syslog,
- char *buf, size_t size)
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size)
{
const char *text = log_text(msg);
size_t text_size = msg->text_len;
+ bool prefix = true;
+ bool newline = true;
size_t len = 0;
+ if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
+ prefix = false;
+
+ if (msg->flags & LOG_CONT) {
+ if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
+ prefix = false;
+
+ if (!(msg->flags & LOG_NEWLINE))
+ newline = false;
+ }
+
do {
const char *next = memchr(text, '\n', text_size);
size_t text_len;
@@ -856,19 +912,25 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
if (buf) {
if (print_prefix(msg, syslog, NULL) +
- text_len + 1>= size - len)
+ text_len + 1 >= size - len)
break;
- len += print_prefix(msg, syslog, buf + len);
+ if (prefix)
+ len += print_prefix(msg, syslog, buf + len);
memcpy(buf + len, text, text_len);
len += text_len;
- buf[len++] = '\n';
+ if (next || newline)
+ buf[len++] = '\n';
} else {
/* SYSLOG_ACTION_* buffer size only calculation */
- len += print_prefix(msg, syslog, NULL);
- len += text_len + 1;
+ if (prefix)
+ len += print_prefix(msg, syslog, NULL);
+ len += text_len;
+ if (next || newline)
+ len++;
}
+ prefix = true;
text = next;
} while (text);
@@ -881,28 +943,42 @@ static int syslog_print(char __user *buf, int size)
struct log *msg;
int len = 0;
- text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
while (size > 0) {
size_t n;
+ size_t skip;
raw_spin_lock_irq(&logbuf_lock);
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
}
if (syslog_seq == log_next_seq) {
raw_spin_unlock_irq(&logbuf_lock);
break;
}
+
+ skip = syslog_partial;
msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, text, LOG_LINE_MAX);
- if (n <= size) {
+ n = msg_print_text(msg, syslog_prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (n - syslog_partial <= size) {
+ /* message fits into buffer, move forward */
syslog_idx = log_next(syslog_idx);
syslog_seq++;
+ syslog_prev = msg->flags;
+ n -= syslog_partial;
+ syslog_partial = 0;
+ } else if (!len){
+ /* partial read(), remember position */
+ n = size;
+ syslog_partial += n;
} else
n = 0;
raw_spin_unlock_irq(&logbuf_lock);
@@ -910,17 +986,15 @@ static int syslog_print(char __user *buf, int size)
if (!n)
break;
- len += n;
- size -= n;
- buf += n;
- n = copy_to_user(buf - n, text, n);
-
- if (n) {
- len -= n;
+ if (copy_to_user(buf, text + skip, n)) {
if (!len)
len = -EFAULT;
break;
}
+
+ len += n;
+ size -= n;
+ buf += n;
}
kfree(text);
@@ -932,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
char *text;
int len = 0;
- text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -941,6 +1015,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u64 next_seq;
u64 seq;
u32 idx;
+ enum log_flags prev;
if (clear_seq < log_first_seq) {
/* messages are gone, move to first available one */
@@ -954,10 +1029,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
*/
seq = clear_seq;
idx = clear_idx;
+ prev = 0;
while (seq < log_next_seq) {
struct log *msg = log_from_idx(idx);
- len += msg_print_text(msg, true, NULL, 0);
+ len += msg_print_text(msg, prev, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -965,10 +1041,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* move first record forward until length fits into the buffer */
seq = clear_seq;
idx = clear_idx;
+ prev = 0;
while (len > size && seq < log_next_seq) {
struct log *msg = log_from_idx(idx);
- len -= msg_print_text(msg, true, NULL, 0);
+ len -= msg_print_text(msg, prev, true, NULL, 0);
idx = log_next(idx);
seq++;
}
@@ -977,17 +1054,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
next_seq = log_next_seq;
len = 0;
+ prev = 0;
while (len >= 0 && seq < next_seq) {
struct log *msg = log_from_idx(idx);
int textlen;
- textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
+ textlen = msg_print_text(msg, prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
if (textlen < 0) {
len = textlen;
break;
}
idx = log_next(idx);
seq++;
+ prev = msg->flags;
raw_spin_unlock_irq(&logbuf_lock);
if (copy_to_user(buf + len, text, textlen))
@@ -1000,6 +1080,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
/* messages are gone, move to next one */
seq = log_first_seq;
idx = log_first_idx;
+ prev = 0;
}
}
}
@@ -1018,7 +1099,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
static int saved_console_loglevel = -1;
- static DEFINE_MUTEX(syslog_mutex);
int error;
error = check_syslog_permissions(type, from_file);
@@ -1045,17 +1125,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
error = -EFAULT;
goto out;
}
- error = mutex_lock_interruptible(&syslog_mutex);
- if (error)
- goto out;
error = wait_event_interruptible(log_wait,
syslog_seq != log_next_seq);
- if (error) {
- mutex_unlock(&syslog_mutex);
+ if (error)
goto out;
- }
error = syslog_print(buf, len);
- mutex_unlock(&syslog_mutex);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
@@ -1111,6 +1185,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
}
if (from_file) {
/*
@@ -1120,19 +1196,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
*/
error = log_next_idx - syslog_idx;
} else {
- u64 seq;
- u32 idx;
+ u64 seq = syslog_seq;
+ u32 idx = syslog_idx;
+ enum log_flags prev = syslog_prev;
error = 0;
- seq = syslog_seq;
- idx = syslog_idx;
while (seq < log_next_seq) {
struct log *msg = log_from_idx(idx);
- error += msg_print_text(msg, true, NULL, 0);
+ error += msg_print_text(msg, prev, true, NULL, 0);
idx = log_next(idx);
seq++;
+ prev = msg->flags;
}
+ error -= syslog_partial;
}
raw_spin_unlock_irq(&logbuf_lock);
break;
@@ -1153,21 +1230,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
}
-#ifdef CONFIG_KGDB_KDB
-/* kdb dmesg command needs access to the syslog buffer. do_syslog()
- * uses locks so it cannot be used during debugging. Just tell kdb
- * where the start and end of the physical and logical logs are. This
- * is equivalent to do_syslog(3).
- */
-void kdb_syslog_data(char *syslog_data[4])
-{
- syslog_data[0] = log_buf;
- syslog_data[1] = log_buf + log_buf_len;
- syslog_data[2] = log_buf + log_first_idx;
- syslog_data[3] = log_buf + log_next_idx;
-}
-#endif /* CONFIG_KGDB_KDB */
-
static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
@@ -1325,20 +1387,36 @@ static struct cont {
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log level of first message */
+ enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
-static void cont_flush(void)
+static void cont_flush(enum log_flags flags)
{
if (cont.flushed)
return;
if (cont.len == 0)
return;
- log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
- NULL, 0, cont.buf, cont.len);
-
- cont.flushed = true;
+ if (cont.cons) {
+ /*
+ * If a fragment of this line was directly flushed to the
+ * console, wait for the console to pick up the rest of the
+ * line. LOG_NOCONS suppresses duplicated output.
+ */
+ log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
+ cont.flags = flags;
+ cont.flushed = true;
+ } else {
+ /*
+ * If no fragment of this line ever reached the console,
+ * just submit it to the store and free the buffer.
+ */
+ log_store(cont.facility, cont.level, flags, 0,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
+ }
}
static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1347,7 +1425,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
return false;
if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
+ /* the line gets too long, split it up in separate records */
+ cont_flush(LOG_CONT);
return false;
}
@@ -1356,12 +1435,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
cont.level = level;
cont.owner = current;
cont.ts_nsec = local_clock();
+ cont.flags = 0;
cont.cons = 0;
cont.flushed = false;
}
memcpy(cont.buf + cont.len, text, len);
cont.len += len;
+
+ if (cont.len > (sizeof(cont.buf) * 80) / 100)
+ cont_flush(LOG_CONT);
+
return true;
}
@@ -1370,7 +1454,7 @@ static size_t cont_print_text(char *text, size_t size)
size_t textlen = 0;
size_t len;
- if (cont.cons == 0) {
+ if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
textlen += print_time(cont.ts_nsec, text);
size -= textlen;
}
@@ -1385,7 +1469,8 @@ static size_t cont_print_text(char *text, size_t size)
}
if (cont.flushed) {
- text[textlen++] = '\n';
+ if (cont.flags & LOG_NEWLINE)
+ text[textlen++] = '\n';
/* got everything, release buffer */
cont.len = 0;
}
@@ -1400,10 +1485,9 @@ asmlinkage int vprintk_emit(int facility, int level,
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len;
+ enum log_flags lflags = 0;
unsigned long flags;
int this_cpu;
- bool newline = false;
- bool prefix = false;
int printed_len = 0;
boot_delay_msec();
@@ -1442,7 +1526,7 @@ asmlinkage int vprintk_emit(int facility, int level,
recursion_bug = 0;
printed_len += strlen(recursion_msg);
/* emit KERN_CRIT message */
- log_store(0, 2, LOG_DEFAULT, 0,
+ log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg, printed_len);
}
@@ -1455,42 +1539,46 @@ asmlinkage int vprintk_emit(int facility, int level,
/* mark and strip a trailing newline */
if (text_len && text[text_len-1] == '\n') {
text_len--;
- newline = true;
- }
-
- /* strip syslog prefix and extract log level or control flags */
- if (text[0] == '<' && text[1] && text[2] == '>') {
- switch (text[1]) {
- case '0' ... '7':
- if (level == -1)
- level = text[1] - '0';
- case 'd': /* KERN_DEFAULT */
- prefix = true;
- case 'c': /* KERN_CONT */
- text += 3;
- text_len -= 3;
+ lflags |= LOG_NEWLINE;
+ }
+
+ /* strip kernel syslog prefix and extract log level or control flags */
+ if (facility == 0) {
+ int kern_level = printk_get_level(text);
+
+ if (kern_level) {
+ const char *end_of_header = printk_skip_level(text);
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level == -1)
+ level = kern_level - '0';
+ case 'd': /* KERN_DEFAULT */
+ lflags |= LOG_PREFIX;
+ case 'c': /* KERN_CONT */
+ break;
+ }
+ text_len -= end_of_header - text;
+ text = (char *)end_of_header;
}
}
if (level == -1)
level = default_message_loglevel;
- if (dict) {
- prefix = true;
- newline = true;
- }
+ if (dict)
+ lflags |= LOG_PREFIX|LOG_NEWLINE;
- if (!newline) {
+ if (!(lflags & LOG_NEWLINE)) {
/*
* Flush the conflicting buffer. An earlier newline was missing,
* or another task also prints continuation lines.
*/
- if (cont.len && (prefix || cont.owner != current))
- cont_flush();
+ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
+ cont_flush(LOG_NEWLINE);
/* buffer line if possible, otherwise store it right away */
if (!cont_add(facility, level, text, text_len))
- log_store(facility, level, LOG_DEFAULT, 0,
+ log_store(facility, level, lflags | LOG_CONT, 0,
dict, dictlen, text, text_len);
} else {
bool stored = false;
@@ -1502,13 +1590,13 @@ asmlinkage int vprintk_emit(int facility, int level,
* flush it out and store this line separately.
*/
if (cont.len && cont.owner == current) {
- if (!prefix)
+ if (!(lflags & LOG_PREFIX))
stored = cont_add(facility, level, text, text_len);
- cont_flush();
+ cont_flush(LOG_NEWLINE);
}
if (!stored)
- log_store(facility, level, LOG_DEFAULT, 0,
+ log_store(facility, level, lflags, 0,
dict, dictlen, text, text_len);
}
printed_len += text_len;
@@ -1595,9 +1683,20 @@ asmlinkage int printk(const char *fmt, ...)
}
EXPORT_SYMBOL(printk);
-#else
+#else /* CONFIG_PRINTK */
+#define LOG_LINE_MAX 0
+#define PREFIX_MAX 0
#define LOG_LINE_MAX 0
+static u64 syslog_seq;
+static u32 syslog_idx;
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags syslog_prev;
+static u64 log_first_seq;
+static u32 log_first_idx;
+static u64 log_next_seq;
+static enum log_flags console_prev;
static struct cont {
size_t len;
size_t cons;
@@ -1607,8 +1706,8 @@ static struct cont {
static struct log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
static void call_console_drivers(int level, const char *text, size_t len) {}
-static size_t msg_print_text(const struct log *msg, bool syslog,
- char *buf, size_t size) { return 0; }
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
#endif /* CONFIG_PRINTK */
@@ -1881,9 +1980,34 @@ void wake_up_klogd(void)
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
}
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
+static void console_cont_flush(char *text, size_t size)
+{
+ unsigned long flags;
+ size_t len;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+
+ if (!cont.len)
+ goto out;
+
+ /*
+ * We still queue earlier records, likely because the console was
+ * busy. The earlier ones need to be printed before this one; we
+ * did not flush any fragment so far, so just let it queue up.
+ */
+ if (console_seq < log_next_seq && !cont.cons)
+ goto out;
+
+ len = cont_print_text(text, size);
+ raw_spin_unlock(&logbuf_lock);
+ stop_critical_timings();
+ call_console_drivers(cont.level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ return;
+out:
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
/**
* console_unlock - unlock the console system
@@ -1901,7 +2025,7 @@ static u32 console_idx;
*/
void console_unlock(void)
{
- static char text[LOG_LINE_MAX];
+ static char text[LOG_LINE_MAX + PREFIX_MAX];
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
@@ -1915,19 +2039,7 @@ void console_unlock(void)
console_may_schedule = 0;
/* flush buffered message fragment immediately to console */
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (cont.len && (cont.cons < cont.len || cont.flushed)) {
- size_t len;
-
- len = cont_print_text(text, sizeof(text));
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings();
- call_console_drivers(cont.level, text, len);
- start_critical_timings();
- local_irq_restore(flags);
- } else
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
+ console_cont_flush(text, sizeof(text));
again:
for (;;) {
struct log *msg;
@@ -1944,6 +2056,7 @@ again:
/* messages are gone, move to first one */
console_seq = log_first_seq;
console_idx = log_first_idx;
+ console_prev = 0;
}
skip:
if (console_seq == log_next_seq)
@@ -1957,14 +2070,22 @@ skip:
*/
console_idx = log_next(console_idx);
console_seq++;
+ /*
+ * We will get here again when we register a new
+ * CON_PRINTBUFFER console. Clear the flag so we
+ * will properly dump everything later.
+ */
+ msg->flags &= ~LOG_NOCONS;
+ console_prev = msg->flags;
goto skip;
}
level = msg->level;
- len = msg_print_text(msg, false, text, sizeof(text));
-
+ len = msg_print_text(msg, console_prev, false,
+ text, sizeof(text));
console_idx = log_next(console_idx);
console_seq++;
+ console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
@@ -2227,6 +2348,7 @@ void register_console(struct console *newcon)
raw_spin_lock_irqsave(&logbuf_lock, flags);
console_seq = syslog_seq;
console_idx = syslog_idx;
+ console_prev = syslog_prev;
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
@@ -2479,7 +2601,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
}
/**
- * kmsg_dump_get_line - retrieve one kmsg log line
+ * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
* @dumper: registered kmsg dumper
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
@@ -2494,11 +2616,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
+ *
+ * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
{
- unsigned long flags;
struct log *msg;
size_t l = 0;
bool ret = false;
@@ -2506,7 +2629,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
if (!dumper->active)
goto out;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
if (dumper->cur_seq < log_first_seq) {
/* messages are gone, move to first available one */
dumper->cur_seq = log_first_seq;
@@ -2514,24 +2636,50 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
}
/* last entry */
- if (dumper->cur_seq >= log_next_seq) {
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (dumper->cur_seq >= log_next_seq)
goto out;
- }
msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog,
- line, size);
+ l = msg_print_text(msg, 0, syslog, line, size);
dumper->cur_idx = log_next(dumper->cur_idx);
dumper->cur_seq++;
ret = true;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
out:
if (len)
*len = l;
return ret;
}
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
@@ -2561,6 +2709,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
u32 idx;
u64 next_seq;
u32 next_idx;
+ enum log_flags prev;
size_t l = 0;
bool ret = false;
@@ -2583,23 +2732,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* calculate length of entire buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
+ prev = 0;
while (seq < dumper->next_seq) {
struct log *msg = log_from_idx(idx);
- l += msg_print_text(msg, true, NULL, 0);
+ l += msg_print_text(msg, prev, true, NULL, 0);
idx = log_next(idx);
seq++;
+ prev = msg->flags;
}
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
+ prev = 0;
while (l > size && seq < dumper->next_seq) {
struct log *msg = log_from_idx(idx);
- l -= msg_print_text(msg, true, NULL, 0);
+ l -= msg_print_text(msg, prev, true, NULL, 0);
idx = log_next(idx);
seq++;
+ prev = msg->flags;
}
/* last message in next interation */
@@ -2607,14 +2760,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
next_idx = idx;
l = 0;
+ prev = 0;
while (seq < dumper->next_seq) {
struct log *msg = log_from_idx(idx);
- l += msg_print_text(msg, syslog,
- buf + l, size - l);
-
+ l += msg_print_text(msg, prev, syslog, buf + l, size - l);
idx = log_next(idx);
seq++;
+ prev = msg->flags;
}
dumper->next_seq = next_seq;
@@ -2629,6 +2782,24 @@ out:
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
+ * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ *
+ * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ */
+void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+{
+ dumper->cur_seq = clear_seq;
+ dumper->cur_idx = clear_idx;
+ dumper->next_seq = log_next_seq;
+ dumper->next_idx = log_next_idx;
+}
+
+/**
* kmsg_dump_rewind - reset the interator
* @dumper: registered kmsg dumper
*
@@ -2641,10 +2812,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
unsigned long flags;
raw_spin_lock_irqsave(&logbuf_lock, flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ kmsg_dump_rewind_nolock(dumper);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e..4e6a61b15e8 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
#ifdef CONFIG_PREEMPT_RCU
/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
* Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
* as debug_check_no_locks_held() already does this if lockdep
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d..547b1fe5b05 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
local_irq_restore(flags);
}
-#ifdef CONFIG_PROVE_RCU
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
}
EXPORT_SYMBOL(rcu_is_cpu_idle);
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
* Test whether the current CPU was interrupted from idle. Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d6510..918fd1e8509 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
RCU_TRACE(.rcb.name = "rcu_preempt")
};
-static void rcu_read_unlock_special(struct task_struct *t);
static int rcu_preempted_readers_exp(void);
static void rcu_report_exp_done(void);
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
rcu_preempt_ctrlblk.boost_tasks =
rcu_preempt_ctrlblk.gp_tasks;
invoke_rcu_callbacks();
- } else
+ } else {
RCU_TRACE(rcu_initiate_boost_trace());
+ }
return 1;
}
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
}
/*
- * Tiny-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
-static noinline void rcu_read_unlock_special(struct task_struct *t)
+void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Tiny-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
- if (t->rcu_read_lock_nesting != 1)
- --t->rcu_read_lock_nesting;
- else {
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the rcu_preempt_ctrlblk structure, which is
* checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
rpcp->exp_tasks = NULL;
/* Wait for tail of ->blkd_tasks list to drain. */
- if (!rcu_preempted_readers_exp())
+ if (!rcu_preempted_readers_exp()) {
local_irq_restore(flags);
- else {
+ } else {
rcu_initiate_boost();
local_irq_restore(flags);
wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
*/
int rcu_preempt_needs_cpu(void)
{
- if (!rcu_preempt_running_reader())
- rcu_preempt_cpu_qs();
return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab755..25b15033c61 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
#include <asm/byteorder.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
- "Josh Triplett <josh@freedesktop.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static bool barrier_phase; /* Test phase. */
static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
rcu_torture_free(rp);
- } else
+ } else {
cur_ops->deferred_free(rp);
+ }
}
static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
synchronize_srcu(&srcu_ctl);
}
+static void srcu_torture_call(struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ call_srcu(&srcu_ctl, head, func);
+}
+
+static void srcu_torture_barrier(void)
+{
+ srcu_barrier(&srcu_ctl);
+}
+
static int srcu_torture_stats(char *page)
{
int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
.completed = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
- .call = NULL,
- .cb_barrier = NULL,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.name = "srcu"
};
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
do {
schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
udelay(rcu_random(&rand) & 0x3ff);
- cur_ops->sync();
+ if (cur_ops->cb_barrier != NULL &&
+ rcu_random(&rand) % (nfakewriters * 8) == 0)
+ cur_ops->cb_barrier();
+ else
+ cur_ops->sync();
rcu_stutter_wait("rcu_torture_fakewriter");
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
}
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
- "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d rtbke: %ld rtbre: %ld "
- "rtbf: %ld rtb: %ld nt: %ld "
- "onoff: %ld/%ld:%ld/%ld "
- "barrier: %ld/%ld:%ld",
+ "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
- atomic_read(&n_rcu_torture_free),
+ atomic_read(&n_rcu_torture_free));
+ cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror,
+ n_rcu_torture_boost_rterror);
+ cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
- n_rcu_torture_timers,
+ n_rcu_torture_timers);
+ cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
n_online_successes,
n_online_attempts,
n_offline_successes,
- n_offline_attempts,
+ n_offline_attempts);
+ cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
n_barrier_successes,
n_barrier_attempts,
n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
delta = shutdown_time - jiffies_snap;
if (verbose)
printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_shutdown task: %lu "
- "jiffies remaining\n",
+ "rcu_torture_shutdown task: %lu jiffies remaining\n",
torture_type, delta);
schedule_timeout_interruptible(delta);
jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
if (cpu_down(cpu) == 0) {
if (verbose)
printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: "
- "offlined %d\n",
+ "rcu_torture_onoff task: offlined %d\n",
torture_type, cpu);
n_offline_successes++;
}
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
if (cpu_up(cpu) == 0) {
if (verbose)
printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: "
- "onlined %d\n",
+ "rcu_torture_onoff task: onlined %d\n",
torture_type, cpu);
n_online_successes++;
}
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
+ bool lastphase = 0;
struct rcu_head rcu;
init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
set_user_nice(current, 19);
do {
wait_event(barrier_cbs_wq[myid],
- atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+ barrier_phase != lastphase ||
kthread_should_stop() ||
fullstop != FULLSTOP_DONTSTOP);
+ lastphase = barrier_phase;
+ smp_mb(); /* ensure barrier_phase load before ->call(). */
if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
break;
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
- /* wake_up() path contains the required barriers. */
+ smp_mb(); /* Ensure barrier_phase after prior assignments. */
+ barrier_phase = !barrier_phase;
for (i = 0; i < n_barrier_cbs; i++)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
schedule_timeout_interruptible(HZ / 10);
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
- rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+ rcutorture_shutdown_absorb("rcu_torture_barrier");
while (!kthread_should_stop())
schedule_timeout_interruptible(1);
return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
&rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
- &srcu_raw_sync_ops, &srcu_expedited_ops,
+ &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
+ &srcu_raw_ops, &srcu_raw_sync_ops,
&sched_ops, &sched_sync_ops, &sched_expedited_ops, };
mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
return -EINVAL;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
- printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
- "fqs_duration, fqs disabled.\n");
+ printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
if (cur_ops->init)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 38ecdda3f55..f280e542e3e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
/* Data structures. */
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
-
-#define RCU_STATE_INITIALIZER(structname) { \
- .level = { &structname##_state.node[0] }, \
- .levelcnt = { \
- NUM_RCU_LVL_0, /* root of hierarchy. */ \
- NUM_RCU_LVL_1, \
- NUM_RCU_LVL_2, \
- NUM_RCU_LVL_3, \
- NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
- }, \
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+
+#define RCU_STATE_INITIALIZER(sname, cr) { \
+ .level = { &sname##_state.node[0] }, \
+ .call = cr, \
.fqs_state = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
- .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
- .orphan_nxttail = &structname##_state.orphan_nxtlist, \
- .orphan_donetail = &structname##_state.orphan_donelist, \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
- .n_force_qs = 0, \
- .n_force_qs_ngp = 0, \
- .name = #structname, \
+ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
+ .orphan_nxttail = &sname##_state.orphan_nxtlist, \
+ .orphan_donetail = &sname##_state.orphan_donelist, \
+ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
+ .name = #sname, \
}
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
+struct rcu_state rcu_sched_state =
+ RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
+ NUM_RCU_LVL_0,
+ NUM_RCU_LVL_1,
+ NUM_RCU_LVL_2,
+ NUM_RCU_LVL_3,
+ NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
* The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
-/* State information for rcu_barrier() and friends. */
-
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
{
trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
+ rcu_preempt_note_context_switch(cpu);
trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
- ftrace_dump(DUMP_ALL);
+ ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
trace_rcu_dyntick("Error on exit: not idle task",
oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ALL);
+ ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
-#ifdef CONFIG_PROVE_RCU
-
/**
* rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
*
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
}
EXPORT_SYMBOL(rcu_is_cpu_idle);
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
* Is the current CPU online? Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/**
* rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
int cpu;
long delta;
unsigned long flags;
- int ndetected;
+ int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
/* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
*/
rnp = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected = rcu_print_task_stall(rnp);
+ ndetected += rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
*/
void rcu_cpu_stall_reset(void)
{
- rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_preempt_stall_reset();
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}
static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
if (rnp->qsmask & rdp->grpmask) {
rdp->qs_pending = 1;
rdp->passed_quiesce = 0;
- } else
+ } else {
rdp->qs_pending = 0;
+ }
zero_cpu_stall_ticks(rdp);
}
}
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ int i;
+
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
* Advance this CPU's callbacks, but only if the current grace period
* has ended. This may be called only from the CPU to whom the rdp
* belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
- int i;
-
/*
* Orphan the callbacks. First adjust the counts. This is safe
* because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->qlen += rdp->qlen;
rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
- rdp->qlen = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
}
/*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
}
/* Finally, initialize the rcu_data structure's list to empty. */
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ init_callback_list(rdp);
}
/*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp, true);
+ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+ cpu, rdp->qlen, rdp->nxtlist);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- rdp->qlen -= count;
+ ACCESS_ONCE(rdp->qlen) -= count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
rdp->n_force_qs_snap = rsp->n_force_qs;
} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = rdp->qlen;
+ WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
local_irq_restore(flags);
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
break; /* grace period idle or initializing, ignore. */
case RCU_SAVE_DYNTICK:
- if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
- break; /* So gcc recognizes the dead code. */
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
* whom the rdp belongs.
*/
static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+__rcu_process_callbacks(struct rcu_state *rsp)
{
unsigned long flags;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
WARN_ON_ONCE(rdp->beenonline == 0);
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static void rcu_process_callbacks(struct softirq_action *unused)
{
+ struct rcu_state *rsp;
+
trace_rcu_utilization("Start RCU core");
- __rcu_process_callbacks(&rcu_sched_state,
- &__get_cpu_var(rcu_sched_data));
- __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
- rcu_preempt_process_callbacks();
+ for_each_rcu_flavor(rsp)
+ __rcu_process_callbacks(rsp);
trace_rcu_utilization("End RCU core");
}
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
raise_softirq(RCU_SOFTIRQ);
}
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+ struct rcu_head *head, unsigned long flags)
+{
+ /*
+ * If called from an extended quiescent state, invoke the RCU
+ * core in order to force a re-evaluation of RCU's idleness.
+ */
+ if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+ if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+ return;
+
+ /*
+ * Force the grace period if too many callbacks or too long waiting.
+ * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * if some other CPU has recently done so. Also, don't bother
+ * invoking force_quiescent_state() if the newly enqueued callback
+ * is the only one waiting for a grace period to complete.
+ */
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+ /* Are we ignoring a completed grace period? */
+ rcu_process_gp_end(rsp, rdp);
+ check_for_new_grace_period(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ unsigned long nestflag;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+ rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
+ } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
+ force_quiescent_state(rsp, 1);
+}
+
static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
- rdp->qlen++;
+ ACCESS_ONCE(rdp->qlen)++;
if (lazy)
rdp->qlen_lazy++;
else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
else
trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
- /* If interrupts were disabled, don't dive into RCU core. */
- if (irqs_disabled_flags(flags)) {
- local_irq_restore(flags);
- return;
- }
-
- /*
- * Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
- * if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
- * is the only one waiting for a grace period to complete.
- */
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-
- /* Are we ignoring a completed grace period? */
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
-
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
- } else {
- /* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
- }
- } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
+ /* Go handle any RCU core processing required. */
+ __call_rcu_core(rsp, rdp, head, flags);
local_irq_restore(flags);
}
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
* occasionally incorrectly indicate that there are multiple CPUs online
* when there was in fact only one the whole time, as this just adds
* some overhead: RCU still operates correctly.
- *
- * Of course, sampling num_online_cpus() with preemption enabled can
- * give erroneous results if there are concurrent CPU-hotplug operations.
- * For example, given a demonic sequence of preemptions in num_online_cpus()
- * and CPU-hotplug operations, there could be two or more CPUs online at
- * all times, but num_online_cpus() might well return one (or even zero).
- *
- * However, all such demonic sequences require at least one CPU-offline
- * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
- * is only a problem if there is an RCU read-side critical section executing
- * throughout. But RCU-sched and RCU-bh read-side critical sections
- * disable either preemption or bh, which prevents a CPU from going offline.
- * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
- * that there is only one CPU when in fact there was more than one throughout
- * is when there were no RCU readers in the system. If there are no
- * RCU readers, the grace period by definition can be of zero length,
- * regardless of the number of online CPUs.
*/
static inline int rcu_blocking_is_gp(void)
{
+ int ret;
+
might_sleep(); /* Check for RCU read-side critical section. */
- return num_online_cpus() <= 1;
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
}
/**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
put_online_cpus();
/* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10)
+ if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
- else {
+ } else {
synchronize_sched();
return;
}
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
*/
static int rcu_pending(int cpu)
{
- return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
- __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
- rcu_preempt_pending(cpu);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ return 1;
+ return 0;
}
/*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
*/
static int rcu_cpu_has_callbacks(int cpu)
{
+ struct rcu_state *rsp;
+
/* RCU callbacks either ready or pending? */
- return per_cpu(rcu_sched_data, cpu).nxtlist ||
- per_cpu(rcu_bh_data, cpu).nxtlist ||
- rcu_preempt_cpu_has_callbacks(cpu);
+ for_each_rcu_flavor(rsp)
+ if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
+ return 1;
+ return 0;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing. If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+ int cpu, unsigned long done)
+{
+ trace_rcu_barrier(rsp->name, s, cpu,
+ atomic_read(&rsp->barrier_cpu_count), done);
}
/*
* RCU callback function for _rcu_barrier(). If we are last, wake
* up the task executing _rcu_barrier().
*/
-static void rcu_barrier_callback(struct rcu_head *notused)
+static void rcu_barrier_callback(struct rcu_head *rhp)
{
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
+ struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+ struct rcu_state *rsp = rdp->rsp;
+
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ complete(&rsp->barrier_completion);
+ } else {
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ }
}
/*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
*/
static void rcu_barrier_func(void *type)
{
- int cpu = smp_processor_id();
- struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
+ struct rcu_state *rsp = type;
+ struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
- atomic_inc(&rcu_barrier_cpu_count);
- call_rcu_func = type;
- call_rcu_func(head, rcu_barrier_callback);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
*/
-static void _rcu_barrier(struct rcu_state *rsp,
- void (*call_rcu_func)(struct rcu_head *head,
- void (*func)(struct rcu_head *head)))
+static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
struct rcu_data *rdp;
- struct rcu_head rh;
+ struct rcu_data rd;
+ unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap_done;
- init_rcu_head_on_stack(&rh);
+ init_rcu_head_on_stack(&rd.barrier_head);
+ _rcu_barrier_trace(rsp, "Begin", -1, snap);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
- mutex_lock(&rcu_barrier_mutex);
+ mutex_lock(&rsp->barrier_mutex);
+
+ /*
+ * Ensure that all prior references, including to ->n_barrier_done,
+ * are ordered before the _rcu_barrier() machinery.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /*
+ * Recheck ->n_barrier_done to see if others did our work for us.
+ * This means checking ->n_barrier_done for an even-to-odd-to-even
+ * transition. The "if" expression below therefore rounds the old
+ * value up to the next even number and adds two before comparing.
+ */
+ snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+ if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ smp_mb(); /* caller's subsequent code after above check. */
+ mutex_unlock(&rsp->barrier_mutex);
+ return;
+ }
- smp_mb(); /* Prevent any prior operations from leaking in. */
+ /*
+ * Increment ->n_barrier_done to avoid duplicate work. Use
+ * ACCESS_ONCE() to prevent the compiler from speculating
+ * the increment to precede the early-exit check.
+ */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+ smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
/*
* Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
* 6. Both rcu_barrier_callback() callbacks are invoked, awakening
* us -- but before CPU 1's orphaned callbacks are invoked!!!
*/
- init_completion(&rcu_barrier_completion);
- atomic_set(&rcu_barrier_cpu_count, 1);
+ init_completion(&rsp->barrier_completion);
+ atomic_set(&rsp->barrier_cpu_count, 1);
raw_spin_lock_irqsave(&rsp->onofflock, flags);
rsp->rcu_barrier_in_progress = current;
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
preempt_disable();
rdp = per_cpu_ptr(rsp->rda, cpu);
if (cpu_is_offline(cpu)) {
+ _rcu_barrier_trace(rsp, "Offline", cpu,
+ rsp->n_barrier_done);
preempt_enable();
while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
schedule_timeout_interruptible(1);
} else if (ACCESS_ONCE(rdp->qlen)) {
- smp_call_function_single(cpu, rcu_barrier_func,
- (void *)call_rcu_func, 1);
+ _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ rsp->n_barrier_done);
+ smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
preempt_enable();
} else {
+ _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ rsp->n_barrier_done);
preempt_enable();
}
}
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
rcu_adopt_orphan_cbs(rsp);
rsp->rcu_barrier_in_progress = NULL;
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- atomic_inc(&rcu_barrier_cpu_count);
+ atomic_inc(&rsp->barrier_cpu_count);
smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
- call_rcu_func(&rh, rcu_barrier_callback);
+ rd.rsp = rsp;
+ rsp->call(&rd.barrier_head, rcu_barrier_callback);
/*
* Now that we have an rcu_barrier_callback() callback on each
* CPU, and thus each counted, remove the initial count.
*/
- if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- complete(&rcu_barrier_completion);
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+ complete(&rsp->barrier_completion);
+
+ /* Increment ->n_barrier_done to prevent duplicate work. */
+ smp_mb(); /* Keep increment after above mechanism. */
+ ACCESS_ONCE(rsp->n_barrier_done)++;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+ smp_mb(); /* Keep increment before caller's subsequent code. */
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
- wait_for_completion(&rcu_barrier_completion);
+ wait_for_completion(&rsp->barrier_completion);
/* Other rcu_barrier() invocations can now safely proceed. */
- mutex_unlock(&rcu_barrier_mutex);
+ mutex_unlock(&rsp->barrier_mutex);
- destroy_rcu_head_on_stack(&rh);
+ destroy_rcu_head_on_stack(&rd.barrier_head);
}
/**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
*/
void rcu_barrier_bh(void)
{
- _rcu_barrier(&rcu_bh_state, call_rcu_bh);
+ _rcu_barrier(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
*/
void rcu_barrier_sched(void)
{
- _rcu_barrier(&rcu_sched_state, call_rcu_sched);
+ _rcu_barrier(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
@@ -2406,18 +2485,15 @@ static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- int i;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ init_callback_list(rdp);
rdp->qlen_lazy = 0;
- rdp->qlen = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
static void __cpuinit rcu_prepare_cpu(int cpu)
{
- rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
- rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
- rcu_preempt_init_percpu_data(cpu);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_init_percpu_data(cpu, rsp,
+ strcmp(rsp->name, "rcu_preempt") == 0);
}
/*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
+ struct rcu_state *rsp;
trace_rcu_utilization("Start CPU hotplug");
switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
* touch any data without introducing corruption. We send the
* dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_cleanup_dying_cpu(&rcu_bh_state);
- rcu_cleanup_dying_cpu(&rcu_sched_state);
- rcu_preempt_cleanup_dying_cpu();
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
- rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
- rcu_preempt_cleanup_dead_cpu(cpu);
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dead_cpu(cpu, rsp);
break;
default:
break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+ for (i = rcu_num_lvls - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+ rsp->levelspread[0] = rcu_fanout_leaf;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
int i;
cprv = NR_CPUS;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
ccur = rsp->levelcnt[i];
rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
- for (i = 1; i < NUM_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
+ rsp->levelcnt[i] = num_rcu_lvl[i];
+ for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
/* Initialize the elements themselves, starting from the leaves. */
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
cpustride *= rsp->levelspread[i];
rnp = rsp->level[i];
for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rsp->rda = rda;
- rnp = rsp->level[NUM_RCU_LVLS - 1];
+ rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
per_cpu_ptr(rsp->rda, i)->mynode = rnp;
rcu_boot_init_percpu_data(i, rsp);
}
+ list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters. This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+ int i;
+ int j;
+ int n = nr_cpu_ids;
+ int rcu_capacity[MAX_RCU_LVLS + 1];
+
+ /* If the compile-time values are accurate, just leave. */
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+ return;
+
+ /*
+ * Compute number of nodes that can be handled by an rcu_node tree
+ * with the given number of levels. Setting rcu_capacity[0] makes
+ * some of the arithmetic easier.
+ */
+ rcu_capacity[0] = 1;
+ rcu_capacity[1] = rcu_fanout_leaf;
+ for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+ /*
+ * The boot-time rcu_fanout_leaf parameter is only permitted
+ * to increase the leaf-level fanout, not decrease it. Of course,
+ * the leaf-level fanout cannot exceed the number of bits in
+ * the rcu_node masks. Finally, the tree must be able to accommodate
+ * the configured number of CPUs. Complain and fall back to the
+ * compile-time values if these limits are exceeded.
+ */
+ if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+ n > rcu_capacity[MAX_RCU_LVLS]) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Calculate the number of rcu_nodes at each level of the tree. */
+ for (i = 1; i <= MAX_RCU_LVLS; i++)
+ if (n <= rcu_capacity[i]) {
+ for (j = 0; j <= i; j++)
+ num_rcu_lvl[j] =
+ DIV_ROUND_UP(n, rcu_capacity[i - j]);
+ rcu_num_lvls = i;
+ for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+ num_rcu_lvl[j] = 0;
+ break;
+ }
+
+ /* Calculate the total number of rcu_node structures. */
+ rcu_num_nodes = 0;
+ for (i = 0; i <= MAX_RCU_LVLS; i++)
+ rcu_num_nodes += num_rcu_lvl[i];
+ rcu_num_nodes -= n;
}
void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
int cpu;
rcu_bootup_announce();
+ rcu_init_geometry();
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ea056495783..4d29169f212 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
-# define NUM_RCU_LVLS 1
+# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_2
-# define NUM_RCU_LVLS 2
+# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_3
-# define NUM_RCU_LVLS 3
+# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
#elif NR_CPUS <= RCU_FANOUT_4
-# define NUM_RCU_LVLS 4
+# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+extern int rcu_num_lvls;
+extern int rcu_num_nodes;
+
/*
* Dynticks per-CPU state.
*/
@@ -97,6 +100,7 @@ struct rcu_dynticks {
/* # times non-lazy CBs posted to CPU. */
unsigned long nonlazy_posted_snap;
/* idle-period nonlazy_posted snapshot. */
+ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
@@ -206,7 +210,7 @@ struct rcu_node {
*/
#define rcu_for_each_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
* Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
- (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
unsigned long n_rp_need_fqs;
unsigned long n_rp_need_nothing;
+ /* 6) _rcu_barrier() callback. */
+ struct rcu_head barrier_head;
+
int cpu;
struct rcu_state *rsp;
};
@@ -357,10 +364,12 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
+ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
+ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
+ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
+ void (*func)(struct rcu_head *head));
/* The following fields are guarded by the root rcu_node's lock. */
@@ -392,6 +401,11 @@ struct rcu_state {
struct task_struct *rcu_barrier_in_progress;
/* Task doing rcu_barrier(), */
/* or NULL if no barrier. */
+ struct mutex barrier_mutex; /* Guards barrier fields. */
+ atomic_t barrier_cpu_count; /* # CPUs waiting on. */
+ struct completion barrier_completion; /* Wake at barrier end. */
+ unsigned long n_barrier_done; /* ++ at start and end of */
+ /* _rcu_barrier(). */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
char *name; /* Name of structure. */
+ struct list_head flavors; /* List of RCU flavors. */
};
+extern struct list_head rcu_struct_flavors;
+#define for_each_rcu_flavor(rsp) \
+ list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
+
/* Return values for rcu_preempt_offline_tasks(). */
#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
+static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
-static void rcu_preempt_stall_reset(void);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_cleanup_dead_cpu(int cpu);
static void rcu_preempt_check_callbacks(int cpu);
-static void rcu_preempt_process_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
-static int rcu_preempt_pending(int cpu);
-static int rcu_preempt_cpu_has_callbacks(int cpu);
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_cleanup_dying_cpu(void);
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5271a020887..7f3244c0df0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
#endif
#if NUM_RCU_LVL_4 != 0
- printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
+ printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
#endif
+ if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
+ if (nr_cpu_ids != NR_CPUS)
+ printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
}
#ifdef CONFIG_TREE_PREEMPT_RCU
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
+struct rcu_state rcu_preempt_state =
+ RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
-static void rcu_read_unlock_special(struct task_struct *t);
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
/*
@@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu)
*
* Caller must disable preemption.
*/
-void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(int cpu)
{
struct task_struct *t = current;
unsigned long flags;
@@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void)
(t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = __this_cpu_ptr(rcu_preempt_state.rda);
+ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void)
* means that we continue to block the current grace period.
*/
local_irq_save(flags);
- rcu_preempt_qs(smp_processor_id());
+ rcu_preempt_qs(cpu);
local_irq_restore(flags);
}
/*
- * Tree-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
* Check for preempted RCU readers blocking the current grace period
* for the specified rcu_node structure. If the caller needs a reliable
* answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
-static noinline void rcu_read_unlock_special(struct task_struct *t)
+void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
rnp->grphi,
!!rnp->gp_tasks);
rcu_report_unblock_qs_rnp(rnp, flags);
- } else
+ } else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
}
}
-/*
- * Tree-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1)
- --t->rcu_read_lock_nesting;
- else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
/*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
}
/*
- * Suppress preemptible RCU's CPU stall warnings by pushing the
- * time of the next stall-warning message comfortably far into the
- * future.
- */
-static void rcu_preempt_stall_reset(void)
-{
- rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-}
-
-/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Do CPU-offline processing for preemptible RCU.
- */
-static void rcu_preempt_cleanup_dead_cpu(int cpu)
-{
- rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
-}
-
-/*
* Check for a quiescent state from the current CPU. When a task blocks,
* the task is recorded in the corresponding CPU's rcu_node structure,
* which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}
-/*
- * Process callbacks for preemptible RCU.
- */
-static void rcu_preempt_process_callbacks(void)
-{
- __rcu_process_callbacks(&rcu_preempt_state,
- &__get_cpu_var(rcu_preempt_data));
-}
-
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
int must_wait = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (list_empty(&rnp->blkd_tasks))
+ if (list_empty(&rnp->blkd_tasks)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- else {
+ } else {
rnp->exp_tasks = rnp->blkd_tasks.next;
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
* expedited grace period for us, just leave.
*/
while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (trycount++ < 10)
+ if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
- else {
+ } else {
synchronize_rcu();
return;
}
@@ -917,51 +851,16 @@ mb_ret:
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-/*
- * Check to see if there is any immediate preemptible-RCU-related work
- * to be done.
- */
-static int rcu_preempt_pending(int cpu)
-{
- return __rcu_pending(&rcu_preempt_state,
- &per_cpu(rcu_preempt_data, cpu));
-}
-
-/*
- * Does preemptible RCU have callbacks on this CPU?
- */
-static int rcu_preempt_cpu_has_callbacks(int cpu)
-{
- return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
-}
-
/**
* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
*/
void rcu_barrier(void)
{
- _rcu_barrier(&rcu_preempt_state, call_rcu);
+ _rcu_barrier(&rcu_preempt_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Initialize preemptible RCU's per-CPU data.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
- rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
-}
-
-/*
- * Move preemptible RCU's callbacks from dying CPU to other online CPU
- * and record a quiescent state.
- */
-static void rcu_preempt_cleanup_dying_cpu(void)
-{
- rcu_cleanup_dying_cpu(&rcu_preempt_state);
-}
-
-/*
* Initialize preemptible RCU's state structures.
*/
static void __init __rcu_init_preempt(void)
@@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_note_context_switch(int cpu)
+{
+}
+
+/*
* Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
@@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
}
/*
- * Because preemptible RCU does not exist, there is no need to suppress
- * its CPU stall warnings.
- */
-static void rcu_preempt_stall_reset(void)
-{
-}
-
-/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
@@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptible RCU does not exist, it never needs CPU-offline
- * processing.
- */
-static void rcu_preempt_cleanup_dead_cpu(int cpu)
-{
-}
-
-/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
@@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to process.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-}
-
-/*
* Queue an RCU callback for lazy invocation after a grace period.
* This will likely be later named something like "call_rcu_lazy()",
* but this change will require some way of tagging the lazy RCU
@@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptible RCU does not exist, it never has any work to do.
- */
-static int rcu_preempt_pending(int cpu)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, it never has callbacks
- */
-static int rcu_preempt_cpu_has_callbacks(int cpu)
-{
- return 0;
-}
-
-/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
@@ -1163,21 +1030,6 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Because preemptible RCU does not exist, there is no per-CPU
- * data to initialize.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
-}
-
-/*
- * Because there is no preemptible RCU, there is no cleanup to do.
- */
-static void rcu_preempt_cleanup_dying_cpu(void)
-{
-}
-
-/*
* Because preemptible RCU does not exist, it need not be initialized.
*/
static void __init __rcu_init_preempt(void)
@@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
*/
#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
-#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
+#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+extern int tick_nohz_enabled;
+
/*
* Does the specified flavor of RCU have non-lazy callbacks pending on
* the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
return 1;
}
/* Set up for the possibility that RCU will post a timer. */
- if (rcu_cpu_has_nonlazy_callbacks(cpu))
- *delta_jiffies = RCU_IDLE_GP_DELAY;
- else
- *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
+ if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
+ *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
+ RCU_IDLE_GP_DELAY) - jiffies;
+ } else {
+ *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
+ *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+ }
return 0;
}
@@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
del_timer(&rdtp->idle_gp_timer);
trace_rcu_prep_idle("Cleanup after idle");
+ rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
}
/*
@@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
{
struct timer_list *tp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ int tne;
+
+ /* Handle nohz enablement switches conservatively. */
+ tne = ACCESS_ONCE(tick_nohz_enabled);
+ if (tne != rdtp->tick_nohz_enabled_snap) {
+ if (rcu_cpu_has_callbacks(cpu))
+ invoke_rcu_core(); /* force nohz to see update. */
+ rdtp->tick_nohz_enabled_snap = tne;
+ return;
+ }
+ if (!tne)
+ return;
/*
* If this is an idle re-entry, for example, due to use of
@@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
trace_rcu_prep_idle("Dyntick with callbacks");
rdtp->idle_gp_timer_expires =
- jiffies + RCU_IDLE_GP_DELAY;
+ round_up(jiffies + RCU_IDLE_GP_DELAY,
+ RCU_IDLE_GP_DELAY);
} else {
rdtp->idle_gp_timer_expires =
- jiffies + RCU_IDLE_LAZY_GP_DELAY;
+ round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
trace_rcu_prep_idle("Dyntick with lazy callbacks");
}
tp = &rdtp->idle_gp_timer;
@@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
if (rcu_cpu_has_callbacks(cpu)) {
trace_rcu_prep_idle("More callbacks");
invoke_rcu_core();
- } else
+ } else {
trace_rcu_prep_idle("Callbacks drained");
+ }
}
/*
@@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
+ *cp = '\0';
}
#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d..abffb486e94 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
+static int show_rcubarrier(struct seq_file *m, void *unused)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
+ rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
+ atomic_read(&rsp->barrier_cpu_count),
+ rsp->n_barrier_done);
+ return 0;
+}
+
+static int rcubarrier_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcubarrier, NULL);
+}
+
+static const struct file_operations rcubarrier_fops = {
+ .owner = THIS_MODULE,
+ .open = rcubarrier_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
#ifdef CONFIG_RCU_BOOST
static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
-#define PRINT_RCU_DATA(name, func, m) \
- do { \
- int _p_r_d_i; \
- \
- for_each_possible_cpu(_p_r_d_i) \
- func(m, &per_cpu(name, _p_r_d_i)); \
- } while (0)
-
static int show_rcudata(struct seq_file *m, void *unused)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
- seq_puts(m, "rcu_bh:\n");
- PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+ int cpu;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ seq_printf(m, "%s:\n", rsp->name);
+ for_each_possible_cpu(cpu)
+ print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
+ }
return 0;
}
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
static int show_rcudata_csv(struct seq_file *m, void *unused)
{
+ int cpu;
+ struct rcu_state *rsp;
+
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
seq_puts(m, "\"kt\",\"ktl\"");
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "\"rcu_preempt:\"\n");
- PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "\"rcu_sched:\"\n");
- PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
- seq_puts(m, "\"rcu_bh:\"\n");
- PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
+ for_each_rcu_flavor(rsp) {
+ seq_printf(m, "\"%s:\"\n", rsp->name);
+ for_each_possible_cpu(cpu)
+ print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
+ }
return 0;
}
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
- "j=%04x bt=%04x\n",
+ seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
rnp->grplo, rnp->grphi,
"T."[list_empty(&rnp->blkd_tasks)],
"N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
"B."[!rnp->boost_tasks],
convert_kthread_status(rnp->boost_kthread_status),
rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts,
+ rnp->n_normal_boosts);
+ seq_printf(m, "j=%04x bt=%04x\n",
(int)(jiffies & 0xffff),
(int)(rnp->boost_time & 0xffff));
- seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- " balk",
+ seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
rnp->n_balk_blkd_tasks,
rnp->n_balk_exp_gp_tasks,
rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp;
gpnum = rsp->gpnum;
- seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
- rsp->completed, gpnum, rsp->fqs_state,
+ seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
+ rsp->name, rsp->completed, gpnum, rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff),
+ (int)(jiffies & 0xffff));
+ seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+ for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
static int show_rcuhier(struct seq_file *m, void *unused)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- print_one_rcu_state(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- print_one_rcu_state(m, &rcu_sched_state);
- seq_puts(m, "rcu_bh:\n");
- print_one_rcu_state(m, &rcu_bh_state);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ print_one_rcu_state(m, rsp);
return 0;
}
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
static int show_rcugp(struct seq_file *m, void *unused)
{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- show_one_rcugp(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- show_one_rcugp(m, &rcu_sched_state);
- show_one_rcugp(m, &rcu_bh_state);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ show_one_rcugp(m, rsp);
return 0;
}
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
{
- seq_printf(m, "%3d%cnp=%ld "
- "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
- "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ seq_printf(m, "%3d%cnp=%ld ",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending,
+ rdp->n_rcu_pending);
+ seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
rdp->n_rp_qs_pending,
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp,
+ rdp->n_rp_cpu_needs_gp);
+ seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
rdp->n_rp_gp_completed,
rdp->n_rp_gp_started,
rdp->n_rp_need_fqs,
rdp->n_rp_need_nothing);
}
-static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+static int show_rcu_pending(struct seq_file *m, void *unused)
{
int cpu;
struct rcu_data *rdp;
-
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- if (rdp->beenonline)
- print_one_rcu_pending(m, rdp);
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ seq_printf(m, "%s:\n", rsp->name);
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->beenonline)
+ print_one_rcu_pending(m, rdp);
+ }
}
-}
-
-static int show_rcu_pending(struct seq_file *m, void *unused)
-{
-#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_puts(m, "rcu_preempt:\n");
- print_rcu_pendings(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_puts(m, "rcu_sched:\n");
- print_rcu_pendings(m, &rcu_sched_state);
- seq_puts(m, "rcu_bh:\n");
- print_rcu_pendings(m, &rcu_bh_state);
return 0;
}
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
if (!rcudir)
goto free_out;
+ retval = debugfs_create_file("rcubarrier", 0444, rcudir,
+ NULL, &rcubarrier_fops);
+ if (!retval)
+ goto free_out;
+
retval = debugfs_create_file("rcudata", 0444, rcudir,
NULL, &rcudata_fops);
if (!retval)
diff --git a/kernel/relay.c b/kernel/relay.c
index ab56a1764d4..e8cd2027abb 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
ret += padding;
out:
- splice_shrink_spd(pipe, &spd);
- return ret;
+ splice_shrink_spd(&spd);
+ return ret;
}
static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d..34d45886ee8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
* Arbitrary resource management.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/ioport.h>
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
write_lock(&resource_lock);
+ if (!parent)
+ goto skip;
+
if ((start < parent->start) || (end > parent->end))
goto out;
- for (tmp = res->child; tmp; tmp = tmp->sibling) {
- if ((tmp->start < start) || (tmp->end > end))
- goto out;
- }
-
if (res->sibling && (res->sibling->start <= end))
goto out;
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
goto out;
}
+skip:
+ for (tmp = res->child; tmp; tmp = tmp->sibling)
+ if ((tmp->start < start) || (tmp->end > end))
+ goto out;
+
res->start = start;
res->end = end;
result = 0;
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name)
{
+ int abort = 0;
+
write_lock(&resource_lock);
- __reserve_region_with_split(root, start, end, name);
+ if (root->start > start || root->end < end) {
+ pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
+ (unsigned long long)start, (unsigned long long)end,
+ root);
+ if (start > root->end || end < root->start)
+ abort = 1;
+ else {
+ if (end > root->end)
+ end = root->end;
+ if (start < root->start)
+ start = root->start;
+ pr_err("fixing request to [0x%llx-0x%llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+ dump_stack();
+ }
+ if (!abort)
+ __reserve_region_with_split(root, start, end, name);
write_unlock(&resource_lock);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d..d325c4b2dcb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ trace_sched_switch(prev, next);
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
- trace_sched_switch(prev, next);
}
/**
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
- rcu_switch_from(prev);
switch_to(prev, next, prev);
barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns into the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumulating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
-void calc_load_account_idle(struct rq *this_rq)
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
/*
- * Its got a race, we don't care...
+ * If we're still before the sample window, we're done.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ calc_load_update += n * LOAD_FREQ;
+ }
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static inline long calc_load_fold_idle(void)
-{
- return 0;
-}
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-static void calc_global_nohz(void)
-{
-}
-#endif
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
@@ -5894,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5907,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traverse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right ? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
@@ -6967,34 +7134,66 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -7459,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7473,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe..22321db6495 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
return prev_cpu;
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an eligible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ for_each_lower_domain(sd) {
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
struct lb_env {
struct sched_domain *sd;
- int src_cpu;
struct rq *src_rq;
+ int src_cpu;
int dst_cpu;
struct rq *dst_rq;
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
unsigned int flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int new_dst_cpu;
+
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+ return 0;
+
+ new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+ tsk_cpus_allowed(p));
+ if (new_dst_cpu < nr_cpu_ids) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = new_dst_cpu;
+ }
return 0;
}
+
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, active_balance = 0;
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
};
cpumask_copy(cpus, cpu_active_mask);
+ max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
@@ -4267,6 +4281,7 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
+ lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
double_rq_lock(this_rq, busiest);
if (!env.loop)
update_h_load(env.src_cpu);
- ld_moved += move_tasks(&env);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = move_tasks(&env);
+ ld_moved += cur_ld_moved;
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -4296,14 +4317,52 @@ more_balance:
/*
* some other cpu did the load balance for us.
*/
- if (ld_moved && this_cpu != smp_processor_id())
- resched_cpu(this_cpu);
+ if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+ resched_cpu(env.dst_cpu);
+
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or an ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing excess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+ lb_iterations++ < max_lb_iterations) {
+
+ this_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_rq = this_rq;
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_SOME_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
goto redo;
+ }
goto out_balanced;
}
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d..b6baf370cae 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33..c35a1a7dd4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -942,8 +939,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf..be4f856d52f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
void ptrace_notify(int exit_code)
{
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+ if (unlikely(current->task_works)) {
+ if (test_and_clear_ti_thread_flag(current_thread_info(),
+ TIF_NOTIFY_RESUME)) {
+ smp_mb__after_clear_bit();
+ task_work_run();
+ }
+ }
spin_lock_irq(&current->sighand->siglock);
ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct signal_struct *signal = current->signal;
int signr;
+ if (unlikely(current->task_works)) {
+ if (test_and_clear_ti_thread_flag(current_thread_info(),
+ TIF_NOTIFY_RESUME)) {
+ smp_mb__after_clear_bit();
+ task_work_run();
+ }
+ }
+
if (unlikely(uprobe_deny_signal()))
return 0;
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875..29dd40a9f2f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
return 0;
}
EXPORT_SYMBOL(smp_call_function);
-
-void ipi_call_lock(void)
-{
- raw_spin_lock(&call_function.lock);
-}
-
-void ipi_call_unlock(void)
-{
- raw_spin_unlock(&call_function.lock);
-}
-
-void ipi_call_lock_irq(void)
-{
- raw_spin_lock_irq(&call_function.lock);
-}
-
-void ipi_call_unlock_irq(void)
-{
- raw_spin_unlock_irq(&call_function.lock);
-}
#endif /* USE_GENERIC_SMP_HELPERS */
/* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb847..6ef9433e1c7 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
struct task_struct;
-int smpboot_prepare(unsigned int cpu);
-
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
struct task_struct *idle_thread_get(unsigned int cpu);
void idle_thread_set_boot_cpu(void);
diff --git a/kernel/sys.c b/kernel/sys.c
index e0c8ffc50d7..241507f23ec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
- struct vm_area_struct *vma;
struct file *exe_file;
struct dentry *dentry;
int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
down_write(&mm->mmap_sem);
/*
- * Forbid mm->exe_file change if there are mapped other files.
+ * Forbid mm->exe_file change if old file still mapped.
*/
err = -EBUSY;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
- &exe_file->f_path))
- goto exit_unlock;
+ if (mm->exe_file) {
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if (vma->vm_file &&
+ path_equal(&vma->vm_file->f_path,
+ &mm->exe_file->f_path))
+ goto exit_unlock;
}
/*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
goto exit_unlock;
+ err = 0;
set_mm_exe_file(mm, exe_file);
exit_unlock:
up_write(&mm->mmap_sem);
@@ -2011,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
break;
}
me->pdeath_signal = arg2;
- error = 0;
break;
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2025,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
break;
}
set_dumpable(me->mm, arg2);
- error = 0;
break;
case PR_SET_UNALIGN:
@@ -2052,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_TIMING:
if (arg2 != PR_TIMING_STATISTICAL)
error = -EINVAL;
- else
- error = 0;
break;
-
case PR_SET_NAME:
comm[sizeof(me->comm)-1] = 0;
if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2063,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EFAULT;
set_task_comm(me, comm);
proc_comm_connector(me);
- return 0;
+ break;
case PR_GET_NAME:
get_task_comm(comm, me);
if (copy_to_user((char __user *)arg2, comm,
sizeof(comm)))
return -EFAULT;
- return 0;
+ break;
case PR_GET_ENDIAN:
error = GET_ENDIAN(me, arg2);
break;
case PR_SET_ENDIAN:
error = SET_ENDIAN(me, arg2);
break;
-
case PR_GET_SECCOMP:
error = prctl_get_seccomp();
break;
@@ -2104,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
current->default_timer_slack_ns;
else
current->timer_slack_ns = arg2;
- error = 0;
break;
case PR_MCE_KILL:
if (arg4 | arg5)
@@ -2130,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
default:
return -EINVAL;
}
- error = 0;
break;
case PR_MCE_KILL_GET:
if (arg2 | arg3 | arg4 | arg5)
@@ -2149,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
- error = 0;
break;
case PR_GET_CHILD_SUBREAPER:
error = put_user(me->signal->is_child_subreaper,
@@ -2191,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
argv_free(info->argv);
}
-/**
- * orderly_poweroff - Trigger an orderly system poweroff
- * @force: force poweroff if command execution fails
- *
- * This may be called from any context to trigger a system shutdown.
- * If the orderly shutdown fails, it will force an immediate shutdown.
- */
-int orderly_poweroff(bool force)
+static int __orderly_poweroff(void)
{
int argc;
- char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
+ char **argv;
static char *envp[] = {
"HOME=/",
"PATH=/sbin:/bin:/usr/sbin:/usr/bin",
NULL
};
- int ret = -ENOMEM;
+ int ret;
+ argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
if (argv == NULL) {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
__func__, poweroff_cmd);
- goto out;
+ return -ENOMEM;
}
ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
NULL, argv_cleanup, NULL);
-out:
- if (likely(!ret))
- return 0;
-
if (ret == -ENOMEM)
argv_free(argv);
- if (force) {
+ return ret;
+}
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+int orderly_poweroff(bool force)
+{
+ int ret = __orderly_poweroff();
+
+ if (ret && force) {
printk(KERN_WARNING "Failed to start orderly shutdown: "
"forcing the issue\n");
- /* I guess this should try to kick off some daemon to
- sync and poweroff asap. Or not even bother syncing
- if we're doing an emergency shutdown? */
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
emergency_sync();
kernel_power_off();
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb..97186b99b0e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/ctype.h>
#include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
.data = core_pattern,
.maxlen = CORENAME_MAX_SIZE,
.mode = 0644,
- .proc_handler = proc_dostring,
+ .proc_handler = proc_dostring_coredump,
},
{
.procname = "core_pipe_limit",
@@ -1498,7 +1504,7 @@ static struct ctl_table fs_table[] = {
.data = &suid_dumpable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_coredump,
.extra1 = &zero,
.extra2 = &two,
},
@@ -1551,7 +1557,10 @@ static struct ctl_table dev_table[] = {
int __init sysctl_init(void)
{
- register_sysctl_table(sysctl_base_table);
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_table(sysctl_base_table);
+ kmemleak_not_leak(hdr);
return 0;
}
@@ -2009,6 +2018,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+static void validate_coredump_safety(void)
+{
+ if (suid_dumpable == SUID_DUMPABLE_SAFE &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING "Unsafe core_pattern used with "\
+ "suid_dumpable=2. Pipe handler or fully qualified "\
+ "core dump path required.\n");
+ }
+}
+
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos,
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066..91d4e1742a0 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,78 @@
#include <linux/tracehook.h>
int
-task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
+task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
{
+ struct callback_head *last, *first;
unsigned long flags;
- int err = -ESRCH;
-#ifndef TIF_NOTIFY_RESUME
- if (notify)
- return -ENOTSUPP;
-#endif
/*
- * We must not insert the new work if the task has already passed
- * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
- * and check PF_EXITING under pi_lock.
+ * Not inserting the new work if the task has already passed
+ * exit_task_work() is the responsibility of callers.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- if (likely(!(task->flags & PF_EXITING))) {
- hlist_add_head(&twork->hlist, &task->task_works);
- err = 0;
- }
+ last = task->task_works;
+ first = last ? last->next : twork;
+ twork->next = first;
+ if (last)
+ last->next = twork;
+ task->task_works = twork;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
- if (likely(!err) && notify)
+ if (notify)
set_notify_resume(task);
- return err;
+ return 0;
}
-struct task_work *
+struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
unsigned long flags;
- struct task_work *twork;
- struct hlist_node *pos;
+ struct callback_head *last, *res = NULL;
raw_spin_lock_irqsave(&task->pi_lock, flags);
- hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
- if (twork->func == func) {
- hlist_del(&twork->hlist);
- goto found;
+ last = task->task_works;
+ if (last) {
+ struct callback_head *q = last, *p = q->next;
+ while (1) {
+ if (p->func == func) {
+ q->next = p->next;
+ if (p == last)
+ task->task_works = q == p ? NULL : q;
+ res = p;
+ break;
+ }
+ if (p == last)
+ break;
+ q = p;
+ p = q->next;
}
}
- twork = NULL;
- found:
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- return twork;
+ return res;
}
void task_work_run(void)
{
struct task_struct *task = current;
- struct hlist_head task_works;
- struct hlist_node *pos;
+ struct callback_head *p, *q;
- raw_spin_lock_irq(&task->pi_lock);
- hlist_move_list(&task->task_works, &task_works);
- raw_spin_unlock_irq(&task->pi_lock);
+ while (1) {
+ raw_spin_lock_irq(&task->pi_lock);
+ p = task->task_works;
+ task->task_works = NULL;
+ raw_spin_unlock_irq(&task->pi_lock);
- if (unlikely(hlist_empty(&task_works)))
- return;
- /*
- * We use hlist to save the space in task_struct, but we want fifo.
- * Find the last entry, the list should be short, then process them
- * in reverse order.
- */
- for (pos = task_works.first; pos->next; pos = pos->next)
- ;
+ if (unlikely(!p))
+ return;
- for (;;) {
- struct hlist_node **pprev = pos->pprev;
- struct task_work *twork = container_of(pos, struct task_work,
- hlist);
- twork->func(twork);
-
- if (pprev == &task_works.first)
- break;
- pos = container_of(pprev, struct hlist_node, next);
+ q = p->next; /* head */
+ p->next = NULL; /* cut it */
+ while (q) {
+ p = q->next;
+ q->func(q);
+ q = p;
+ }
}
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4..d0a32796550 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
+ if (na == NULL) {
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7b..b7fbadc5c97 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
time_state = TIME_DEL;
break;
case TIME_INS:
- if (secs % 86400 == 0) {
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if (secs % 86400 == 0) {
leap = -1;
time_state = TIME_OOP;
time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if ((secs + 1) % 86400 == 0) {
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if ((secs + 1) % 86400 == 0) {
leap = 1;
time_tai--;
time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 86999783392..024540f97f7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
/*
* NO HZ enabled ?
*/
-static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_enabled __read_mostly = 1;
/*
* Enable / Disable tickless mode
@@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
+static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
+ ktime_t now, int cpu)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+ ktime_t last_update, expires, ret = { .tv64 = 0 };
unsigned long rcu_delta_jiffies;
- ktime_t last_update, expires, now;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
u64 time_delta;
- int cpu;
-
- cpu = smp_processor_id();
- ts = &per_cpu(tick_cpu_sched, cpu);
-
- now = tick_nohz_start_idle(cpu, ts);
-
- /*
- * If this cpu is offline and it is the one which updates
- * jiffies, then give up the assignment and let it be taken by
- * the cpu which runs the tick timer next. If we don't drop
- * this here the jiffies might be stale and do_timer() never
- * invoked.
- */
- if (unlikely(!cpu_online(cpu))) {
- if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- }
-
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
- return;
- if (need_resched())
- return;
-
- if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
- static int ratelimit;
-
- if (ratelimit < 10) {
- printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
- return;
- }
-
- ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&xtime_lock);
@@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
goto out;
+ ret = expires;
+
/*
* nohz_stop_sched_tick can be called several times before
* the nohz_restart_sched_tick is called. This happens when
@@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
*/
if (!ts->tick_stopped) {
select_nohz_load_balancer(1);
+ calc_load_enter_idle();
- ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
- ts->idle_jiffies = last_jiffies;
}
- ts->idle_sleeps++;
-
- /* Mark expires */
- ts->idle_expires = expires;
-
/*
* If the expiration time == KTIME_MAX, then
* in this case we simply stop the tick timer.
@@ -447,6 +409,65 @@ out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
ts->sleep_length = ktime_sub(dev->next_event, now);
+
+ return ret;
+}
+
+static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+{
+ /*
+ * If this cpu is offline and it is the one which updates
+ * jiffies, then give up the assignment and let it be taken by
+ * the cpu which runs the tick timer next. If we don't drop
+ * this here the jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (unlikely(!cpu_online(cpu))) {
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ }
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ return false;
+
+ if (need_resched())
+ return false;
+
+ if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ static int ratelimit;
+
+ if (ratelimit < 10) {
+ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
+ ratelimit++;
+ }
+ return false;
+ }
+
+ return true;
+}
+
+static void __tick_nohz_idle_enter(struct tick_sched *ts)
+{
+ ktime_t now, expires;
+ int cpu = smp_processor_id();
+
+ now = tick_nohz_start_idle(cpu, ts);
+
+ if (can_stop_idle_tick(cpu, ts)) {
+ int was_stopped = ts->tick_stopped;
+
+ ts->idle_calls++;
+
+ expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+ if (expires.tv64 > 0LL) {
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
+ }
+
+ if (!was_stopped && ts->tick_stopped)
+ ts->idle_jiffies = ts->last_jiffies;
+ }
}
/**
@@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void)
* update of the idle time accounting in tick_nohz_start_idle().
*/
ts->inidle = 1;
- tick_nohz_stop_sched_tick(ts);
+ __tick_nohz_idle_enter(ts);
local_irq_enable();
}
@@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void)
if (!ts->inidle)
return;
- tick_nohz_stop_sched_tick(ts);
+ __tick_nohz_idle_enter(ts);
}
/**
@@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void)
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
while (1) {
/* Forward the time to expire in the future */
@@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
}
}
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ select_nohz_load_balancer(0);
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+}
+
+static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ unsigned long ticks;
+ /*
+ * We stopped the tick in idle. Update process times would miss the
+ * time we slept as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to idle !
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX)
+ account_idle_ticks(ticks);
+#endif
+}
+
/**
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
@@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- unsigned long ticks;
-#endif
ktime_t now;
local_irq_disable();
@@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void)
if (ts->idle_active)
tick_nohz_stop_idle(cpu, now);
- if (!ts->tick_stopped) {
- local_irq_enable();
- return;
+ if (ts->tick_stopped) {
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
}
- /* Update jiffies first */
- select_nohz_load_balancer(0);
- tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- /*
- * We stopped the tick in idle. Update process times would miss the
- * time we slept as update_process_times does only a 1 tick
- * accounting. Enforce that this is accounted to idle !
- */
- ticks = jiffies - ts->idle_jiffies;
- /*
- * We might be one off. Do not randomly account a huge number of ticks!
- */
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
-#endif
-
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
- tick_nohz_restart(ts, now);
-
local_irq_enable();
}
@@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog();
- ts->idle_jiffies++;
+ if (idle_cpu(cpu))
+ ts->idle_jiffies++;
}
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8..f045cc50832 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -24,32 +24,32 @@
/* Structure holding internal timekeeping values. */
struct timekeeper {
/* Current clocksource used for timekeeping. */
- struct clocksource *clock;
+ struct clocksource *clock;
/* NTP adjusted clock multiplier */
- u32 mult;
+ u32 mult;
/* The shift value of the current clocksource. */
- int shift;
-
+ u32 shift;
/* Number of clock cycles in one NTP interval. */
- cycle_t cycle_interval;
+ cycle_t cycle_interval;
/* Number of clock shifted nano seconds in one NTP interval. */
- u64 xtime_interval;
+ u64 xtime_interval;
/* shifted nano seconds left over when rounding cycle_interval */
- s64 xtime_remainder;
+ s64 xtime_remainder;
/* Raw nano seconds accumulated per NTP interval. */
- u32 raw_interval;
+ u32 raw_interval;
+
+ /* Current CLOCK_REALTIME time in seconds */
+ u64 xtime_sec;
+ /* Clock shifted nano seconds */
+ u64 xtime_nsec;
- /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
- u64 xtime_nsec;
/* Difference between accumulated time and NTP time in ntp
* shifted nano seconds. */
- s64 ntp_error;
+ s64 ntp_error;
/* Shift conversion between clock shifted nano seconds and
* ntp shifted nano seconds. */
- int ntp_error_shift;
+ u32 ntp_error_shift;
- /* The current time */
- struct timespec xtime;
/*
* wall_to_monotonic is what we need to add to xtime (or xtime corrected
* for sub jiffie times) to get to monotonic time. Monotonic is pegged
@@ -64,14 +64,17 @@ struct timekeeper {
* - wall_to_monotonic is no longer the boot time, getboottime must be
* used instead.
*/
- struct timespec wall_to_monotonic;
+ struct timespec wall_to_monotonic;
/* time spent in suspend */
- struct timespec total_sleep_time;
+ struct timespec total_sleep_time;
/* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
- struct timespec raw_time;
-
+ struct timespec raw_time;
+ /* Offset clock monotonic -> clock realtime */
+ ktime_t offs_real;
+ /* Offset clock monotonic -> clock boottime */
+ ktime_t offs_boot;
/* Seqlock for all timekeeper values */
- seqlock_t lock;
+ seqlock_t lock;
};
static struct timekeeper timekeeper;
@@ -82,11 +85,37 @@ static struct timekeeper timekeeper;
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
-
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
+static inline void tk_normalize_xtime(struct timekeeper *tk)
+{
+ while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
+ tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+ tk->xtime_sec++;
+ }
+}
+
+static struct timespec tk_xtime(struct timekeeper *tk)
+{
+ struct timespec ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ return ts;
+}
+
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
+{
+ tk->xtime_sec = ts->tv_sec;
+ tk->xtime_nsec = ts->tv_nsec << tk->shift;
+}
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
+{
+ tk->xtime_sec += ts->tv_sec;
+ tk->xtime_nsec += ts->tv_nsec << tk->shift;
+}
/**
* timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -98,12 +127,14 @@ int __read_mostly timekeeping_suspended;
*
* Unless you're the timekeeping code, you should not be using this!
*/
-static void timekeeper_setup_internals(struct clocksource *clock)
+static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
cycle_t interval;
u64 tmp, ntpinterval;
+ struct clocksource *old_clock;
- timekeeper.clock = clock;
+ old_clock = tk->clock;
+ tk->clock = clock;
clock->cycle_last = clock->read(clock);
/* Do the ns -> cycle conversion first, using original mult */
@@ -116,71 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock)
tmp = 1;
interval = (cycle_t) tmp;
- timekeeper.cycle_interval = interval;
+ tk->cycle_interval = interval;
/* Go back from cycles -> shifted ns */
- timekeeper.xtime_interval = (u64) interval * clock->mult;
- timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
- timekeeper.raw_interval =
+ tk->xtime_interval = (u64) interval * clock->mult;
+ tk->xtime_remainder = ntpinterval - tk->xtime_interval;
+ tk->raw_interval =
((u64) interval * clock->mult) >> clock->shift;
- timekeeper.xtime_nsec = 0;
- timekeeper.shift = clock->shift;
+ /* if changing clocks, convert xtime_nsec shift units */
+ if (old_clock) {
+ int shift_change = clock->shift - old_clock->shift;
+ if (shift_change < 0)
+ tk->xtime_nsec >>= -shift_change;
+ else
+ tk->xtime_nsec <<= shift_change;
+ }
+ tk->shift = clock->shift;
- timekeeper.ntp_error = 0;
- timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_error = 0;
+ tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- timekeeper.mult = clock->mult;
+ tk->mult = clock->mult;
}
/* Timekeeper helper functions. */
-static inline s64 timekeeping_get_ns(void)
+static inline s64 timekeeping_get_ns(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
+ s64 nsec;
/* read clocksource: */
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- /* return delta convert to nanoseconds using ntp adjusted mult. */
- return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
- timekeeper.shift);
+ nsec = cycle_delta * tk->mult + tk->xtime_nsec;
+ nsec >>= tk->shift;
+
+ /* If arch requires, add in gettimeoffset() */
+ return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns_raw(void)
+static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
+ s64 nsec;
/* read clocksource: */
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- /* return delta convert to nanoseconds. */
- return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ /* convert delta to nanoseconds. */
+ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+
+ /* If arch requires, add in gettimeoffset() */
+ return nsec + arch_gettimeoffset();
+}
+
+static void update_rt_offset(struct timekeeper *tk)
+{
+ struct timespec tmp, *wtm = &tk->wall_to_monotonic;
+
+ set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
+ tk->offs_real = timespec_to_ktime(tmp);
}
/* must hold write on timekeeper.lock */
-static void timekeeping_update(bool clearntp)
+static void timekeeping_update(struct timekeeper *tk, bool clearntp)
{
+ struct timespec xt;
+
if (clearntp) {
- timekeeper.ntp_error = 0;
+ tk->ntp_error = 0;
ntp_clear();
}
- update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
- timekeeper.clock, timekeeper.mult);
+ update_rt_offset(tk);
+ xt = tk_xtime(tk);
+ update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
}
@@ -191,27 +247,26 @@ static void timekeeping_update(bool clearntp)
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void timekeeping_forward_now(void)
+static void timekeeping_forward_now(struct timekeeper *tk)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
s64 nsec;
- clock = timekeeper.clock;
+ clock = tk->clock;
cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
clock->cycle_last = cycle_now;
- nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
- timekeeper.shift);
+ tk->xtime_nsec += cycle_delta * tk->mult;
/* If arch requires, add in gettimeoffset() */
- nsec += arch_gettimeoffset();
+ tk->xtime_nsec += arch_gettimeoffset() << tk->shift;
- timespec_add_ns(&timekeeper.xtime, nsec);
+ tk_normalize_xtime(tk);
nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
- timespec_add_ns(&timekeeper.raw_time, nsec);
+ timespec_add_ns(&tk->raw_time, nsec);
}
/**
@@ -223,18 +278,15 @@ static void timekeeping_forward_now(void)
void getnstimeofday(struct timespec *ts)
{
unsigned long seq;
- s64 nsecs;
+ s64 nsecs = 0;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqbegin(&timekeeper.lock);
- *ts = timekeeper.xtime;
- nsecs = timekeeping_get_ns();
-
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
+ ts->tv_sec = timekeeper.xtime_sec;
+ ts->tv_nsec = timekeeping_get_ns(&timekeeper);
} while (read_seqretry(&timekeeper.lock, seq));
@@ -251,13 +303,10 @@ ktime_t ktime_get(void)
do {
seq = read_seqbegin(&timekeeper.lock);
- secs = timekeeper.xtime.tv_sec +
+ secs = timekeeper.xtime_sec +
timekeeper.wall_to_monotonic.tv_sec;
- nsecs = timekeeper.xtime.tv_nsec +
+ nsecs = timekeeping_get_ns(&timekeeper) +
timekeeper.wall_to_monotonic.tv_nsec;
- nsecs += timekeeping_get_ns();
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
} while (read_seqretry(&timekeeper.lock, seq));
/*
@@ -280,22 +329,19 @@ void ktime_get_ts(struct timespec *ts)
{
struct timespec tomono;
unsigned int seq;
- s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqbegin(&timekeeper.lock);
- *ts = timekeeper.xtime;
+ ts->tv_sec = timekeeper.xtime_sec;
+ ts->tv_nsec = timekeeping_get_ns(&timekeeper);
tomono = timekeeper.wall_to_monotonic;
- nsecs = timekeeping_get_ns();
- /* If arch requires, add in gettimeoffset() */
- nsecs += arch_gettimeoffset();
} while (read_seqretry(&timekeeper.lock, seq));
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + nsecs);
+ ts->tv_nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
@@ -318,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
WARN_ON_ONCE(timekeeping_suspended);
do {
- u32 arch_offset;
-
seq = read_seqbegin(&timekeeper.lock);
*ts_raw = timekeeper.raw_time;
- *ts_real = timekeeper.xtime;
-
- nsecs_raw = timekeeping_get_ns_raw();
- nsecs_real = timekeeping_get_ns();
+ ts_real->tv_sec = timekeeper.xtime_sec;
+ ts_real->tv_nsec = 0;
- /* If arch requires, add in gettimeoffset() */
- arch_offset = arch_gettimeoffset();
- nsecs_raw += arch_offset;
- nsecs_real += arch_offset;
+ nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
+ nsecs_real = timekeeping_get_ns(&timekeeper);
} while (read_seqretry(&timekeeper.lock, seq));
@@ -366,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday);
*/
int do_settimeofday(const struct timespec *tv)
{
- struct timespec ts_delta;
+ struct timespec ts_delta, xt;
unsigned long flags;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
@@ -374,15 +414,18 @@ int do_settimeofday(const struct timespec *tv)
write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now();
+ timekeeping_forward_now(&timekeeper);
+
+ xt = tk_xtime(&timekeeper);
+ ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
- ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
timekeeper.wall_to_monotonic =
timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
- timekeeper.xtime = *tv;
- timekeeping_update(true);
+ tk_set_xtime(&timekeeper, tv);
+
+ timekeeping_update(&timekeeper, true);
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -409,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts)
write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now();
+ timekeeping_forward_now(&timekeeper);
- timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
+
+ tk_xtime_add(&timekeeper, ts);
timekeeper.wall_to_monotonic =
timespec_sub(timekeeper.wall_to_monotonic, *ts);
- timekeeping_update(true);
+ timekeeping_update(&timekeeper, true);
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -440,14 +484,14 @@ static int change_clocksource(void *data)
write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now();
+ timekeeping_forward_now(&timekeeper);
if (!new->enable || new->enable(new) == 0) {
old = timekeeper.clock;
- timekeeper_setup_internals(new);
+ tk_setup_internals(&timekeeper, new);
if (old->disable)
old->disable(old);
}
- timekeeping_update(true);
+ timekeeping_update(&timekeeper, true);
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -497,7 +541,7 @@ void getrawmonotonic(struct timespec *ts)
do {
seq = read_seqbegin(&timekeeper.lock);
- nsecs = timekeeping_get_ns_raw();
+ nsecs = timekeeping_get_ns_raw(&timekeeper);
*ts = timekeeper.raw_time;
} while (read_seqretry(&timekeeper.lock, seq));
@@ -532,6 +576,7 @@ u64 timekeeping_max_deferment(void)
{
unsigned long seq;
u64 ret;
+
do {
seq = read_seqbegin(&timekeeper.lock);
@@ -592,18 +637,17 @@ void __init timekeeping_init(void)
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
- timekeeper_setup_internals(clock);
+ tk_setup_internals(&timekeeper, clock);
- timekeeper.xtime.tv_sec = now.tv_sec;
- timekeeper.xtime.tv_nsec = now.tv_nsec;
+ tk_set_xtime(&timekeeper, &now);
timekeeper.raw_time.tv_sec = 0;
timekeeper.raw_time.tv_nsec = 0;
- if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
- boot.tv_sec = timekeeper.xtime.tv_sec;
- boot.tv_nsec = timekeeper.xtime.tv_nsec;
- }
+ if (boot.tv_sec == 0 && boot.tv_nsec == 0)
+ boot = tk_xtime(&timekeeper);
+
set_normalized_timespec(&timekeeper.wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
+ update_rt_offset(&timekeeper);
timekeeper.total_sleep_time.tv_sec = 0;
timekeeper.total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +656,12 @@ void __init timekeeping_init(void)
/* time in seconds when suspend began */
static struct timespec timekeeping_suspend_time;
+static void update_sleep_time(struct timespec t)
+{
+ timekeeper.total_sleep_time = t;
+ timekeeper.offs_boot = timespec_to_ktime(t);
+}
+
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @delta: pointer to a timespec delta value
@@ -619,7 +669,8 @@ static struct timespec timekeeping_suspend_time;
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables.
*/
-static void __timekeeping_inject_sleeptime(struct timespec *delta)
+static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
+ struct timespec *delta)
{
if (!timespec_valid(delta)) {
printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
@@ -627,11 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
return;
}
- timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
- timekeeper.wall_to_monotonic =
- timespec_sub(timekeeper.wall_to_monotonic, *delta);
- timekeeper.total_sleep_time = timespec_add(
- timekeeper.total_sleep_time, *delta);
+ tk_xtime_add(tk, delta);
+ tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
+ update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
}
@@ -657,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now();
+ timekeeping_forward_now(&timekeeper);
- __timekeeping_inject_sleeptime(delta);
+ __timekeeping_inject_sleeptime(&timekeeper, delta);
- timekeeping_update(true);
+ timekeeping_update(&timekeeper, true);
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -690,12 +739,13 @@ static void timekeeping_resume(void)
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
- __timekeeping_inject_sleeptime(&ts);
+ __timekeeping_inject_sleeptime(&timekeeper, &ts);
}
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
+ timekeeping_update(&timekeeper, false);
write_sequnlock_irqrestore(&timekeeper.lock, flags);
touch_softlockup_watchdog();
@@ -715,7 +765,7 @@ static int timekeeping_suspend(void)
read_persistent_clock(&timekeeping_suspend_time);
write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now();
+ timekeeping_forward_now(&timekeeper);
timekeeping_suspended = 1;
/*
@@ -724,7 +774,7 @@ static int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
+ delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
delta_delta = timespec_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
@@ -763,7 +813,8 @@ device_initcall(timekeeping_init_ops);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
+ s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -779,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -788,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
- tick_error -= timekeeper.xtime_interval >> 1;
+ tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
+ tick_error -= tk->xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
/* Finally calculate the adjustment shift value. */
@@ -814,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
-static void timekeeping_adjust(s64 offset)
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- s64 error, interval = timekeeper.cycle_interval;
+ s64 error, interval = tk->cycle_interval;
int adj;
/*
@@ -832,7 +883,7 @@ static void timekeeping_adjust(s64 offset)
*
* Note: It does not "save" on aggravation when reading the code.
*/
- error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
+ error = tk->ntp_error >> (tk->ntp_error_shift - 1);
if (error > interval) {
/*
* We now divide error by 4(via shift), which checks if
@@ -854,7 +905,8 @@ static void timekeeping_adjust(s64 offset)
if (likely(error <= interval))
adj = 1;
else
- adj = timekeeping_bigadjust(error, &interval, &offset);
+ adj = timekeeping_bigadjust(tk, error, &interval,
+ &offset);
} else if (error < -interval) {
/* See comment above, this is just switched for the negative */
error >>= 2;
@@ -863,18 +915,17 @@ static void timekeeping_adjust(s64 offset)
interval = -interval;
offset = -offset;
} else
- adj = timekeeping_bigadjust(error, &interval, &offset);
- } else /* No adjustment needed */
+ adj = timekeeping_bigadjust(tk, error, &interval,
+ &offset);
+ } else
return;
- if (unlikely(timekeeper.clock->maxadj &&
- (timekeeper.mult + adj >
- timekeeper.clock->mult + timekeeper.clock->maxadj))) {
+ if (unlikely(tk->clock->maxadj &&
+ (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
- timekeeper.clock->name, (long)timekeeper.mult + adj,
- (long)timekeeper.clock->mult +
- timekeeper.clock->maxadj);
+ tk->clock->name, (long)tk->mult + adj,
+ (long)tk->clock->mult + tk->clock->maxadj);
}
/*
* So the following can be confusing.
@@ -925,11 +976,60 @@ static void timekeeping_adjust(s64 offset)
*
* XXX - TODO: Doc ntp_error calculation.
*/
- timekeeper.mult += adj;
- timekeeper.xtime_interval += interval;
- timekeeper.xtime_nsec -= offset;
- timekeeper.ntp_error -= (interval - offset) <<
- timekeeper.ntp_error_shift;
+ tk->mult += adj;
+ tk->xtime_interval += interval;
+ tk->xtime_nsec -= offset;
+ tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+
+ /*
+ * It may be possible that when we entered this function, xtime_nsec
+ * was very small. Further, if we're slightly speeding the clocksource
+ * in the code above, it's possible the required corrective factor to
+ * xtime_nsec could cause it to underflow.
+ *
+ * Now, since we already accumulated the second, we cannot simply roll
+ * the accumulated second back, since the NTP subsystem has been
+ * notified via second_overflow. So instead we push xtime_nsec forward
+ * by the amount we underflowed, and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)tk->xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->xtime_nsec;
+ tk->xtime_nsec = 0;
+ tk->ntp_error += neg << tk->ntp_error_shift;
+ }
+
+}
+
+
+/**
+ * accumulate_nsecs_to_secs - Accumulates nsecs into secs
+ *
+ * Helper function that accumulates the nsecs greater than a second
+ * from the xtime_nsec field to the xtime_sec field.
+ * It also calls into the NTP code to handle leapsecond processing.
+ *
+ */
+static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+
+ while (tk->xtime_nsec >= nsecps) {
+ int leap;
+
+ tk->xtime_nsec -= nsecps;
+ tk->xtime_sec++;
+
+ /* Figure out if it's a leap sec and apply if needed */
+ leap = second_overflow(tk->xtime_sec);
+ tk->xtime_sec += leap;
+ tk->wall_to_monotonic.tv_sec -= leap;
+ if (leap)
+ clock_was_set_delayed();
+
+ }
}
@@ -942,44 +1042,36 @@ static void timekeeping_adjust(s64 offset)
*
* Returns the unconsumed cycles.
*/
-static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
+ u32 shift)
{
- u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
u64 raw_nsecs;
- /* If the offset is smaller than a shifted interval, do nothing */
- if (offset < timekeeper.cycle_interval<<shift)
+ /* If the offset is smaller than a shifted interval, do nothing */
+ if (offset < tk->cycle_interval<<shift)
return offset;
/* Accumulate one shifted interval */
- offset -= timekeeper.cycle_interval << shift;
- timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+ offset -= tk->cycle_interval << shift;
+ tk->clock->cycle_last += tk->cycle_interval << shift;
- timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
- while (timekeeper.xtime_nsec >= nsecps) {
- int leap;
- timekeeper.xtime_nsec -= nsecps;
- timekeeper.xtime.tv_sec++;
- leap = second_overflow(timekeeper.xtime.tv_sec);
- timekeeper.xtime.tv_sec += leap;
- timekeeper.wall_to_monotonic.tv_sec -= leap;
- }
+ tk->xtime_nsec += tk->xtime_interval << shift;
+ accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = timekeeper.raw_interval << shift;
- raw_nsecs += timekeeper.raw_time.tv_nsec;
+ raw_nsecs = tk->raw_interval << shift;
+ raw_nsecs += tk->raw_time.tv_nsec;
if (raw_nsecs >= NSEC_PER_SEC) {
u64 raw_secs = raw_nsecs;
raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- timekeeper.raw_time.tv_sec += raw_secs;
+ tk->raw_time.tv_sec += raw_secs;
}
- timekeeper.raw_time.tv_nsec = raw_nsecs;
+ tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
- timekeeper.ntp_error += ntp_tick_length() << shift;
- timekeeper.ntp_error -=
- (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
- (timekeeper.ntp_error_shift + shift);
+ tk->ntp_error += ntp_tick_length() << shift;
+ tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
+ (tk->ntp_error_shift + shift);
return offset;
}
@@ -995,6 +1087,7 @@ static void update_wall_time(void)
cycle_t offset;
int shift = 0, maxshift;
unsigned long flags;
+ s64 remainder;
write_seqlock_irqsave(&timekeeper.lock, flags);
@@ -1009,8 +1102,6 @@ static void update_wall_time(void)
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
- timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
- timekeeper.shift;
/*
* With NO_HZ we may have to accumulate many cycle_intervals
@@ -1026,62 +1117,36 @@ static void update_wall_time(void)
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
- offset = logarithmic_accumulation(offset, shift);
+ offset = logarithmic_accumulation(&timekeeper, offset, shift);
if(offset < timekeeper.cycle_interval<<shift)
shift--;
}
/* correct the clock when NTP error is too big */
- timekeeping_adjust(offset);
-
- /*
- * Since in the loop above, we accumulate any amount of time
- * in xtime_nsec over a second into xtime.tv_sec, its possible for
- * xtime_nsec to be fairly small after the loop. Further, if we're
- * slightly speeding the clocksource up in timekeeping_adjust(),
- * its possible the required corrective factor to xtime_nsec could
- * cause it to underflow.
- *
- * Now, we cannot simply roll the accumulated second back, since
- * the NTP subsystem has been notified via second_overflow. So
- * instead we push xtime_nsec forward by the amount we underflowed,
- * and add that amount into the error.
- *
- * We'll correct this error next time through this function, when
- * xtime_nsec is not as small.
- */
- if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
- s64 neg = -(s64)timekeeper.xtime_nsec;
- timekeeper.xtime_nsec = 0;
- timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
- }
+ timekeeping_adjust(&timekeeper, offset);
/*
- * Store full nanoseconds into xtime after rounding it up and
- * add the remainder to the error difference.
- */
- timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
- timekeeper.shift) + 1;
- timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
- timekeeper.shift;
- timekeeper.ntp_error += timekeeper.xtime_nsec <<
- timekeeper.ntp_error_shift;
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistencies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), this can be killed.
+ */
+ remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
+ timekeeper.xtime_nsec -= remainder;
+ timekeeper.xtime_nsec += 1 << timekeeper.shift;
+ timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
/*
* Finally, make sure that after the rounding
- * xtime.tv_nsec isn't larger than NSEC_PER_SEC
+ * xtime_nsec isn't larger than NSEC_PER_SEC
*/
- if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
- int leap;
- timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
- timekeeper.xtime.tv_sec++;
- leap = second_overflow(timekeeper.xtime.tv_sec);
- timekeeper.xtime.tv_sec += leap;
- timekeeper.wall_to_monotonic.tv_sec -= leap;
- }
+ accumulate_nsecs_to_secs(&timekeeper);
- timekeeping_update(false);
+ timekeeping_update(&timekeeper, false);
out:
write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -1126,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts)
{
struct timespec tomono, sleep;
unsigned int seq;
- s64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqbegin(&timekeeper.lock);
- *ts = timekeeper.xtime;
+ ts->tv_sec = timekeeper.xtime_sec;
+ ts->tv_nsec = timekeeping_get_ns(&timekeeper);
tomono = timekeeper.wall_to_monotonic;
sleep = timekeeper.total_sleep_time;
- nsecs = timekeeping_get_ns();
} while (read_seqretry(&timekeeper.lock, seq));
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+ ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
}
EXPORT_SYMBOL_GPL(get_monotonic_boottime);
@@ -1173,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
unsigned long get_seconds(void)
{
- return timekeeper.xtime.tv_sec;
+ return timekeeper.xtime_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return timekeeper.xtime;
+ return tk_xtime(&timekeeper);
}
struct timespec current_kernel_time(void)
@@ -1190,7 +1254,7 @@ struct timespec current_kernel_time(void)
do {
seq = read_seqbegin(&timekeeper.lock);
- now = timekeeper.xtime;
+ now = tk_xtime(&timekeeper);
} while (read_seqretry(&timekeeper.lock, seq));
return now;
@@ -1205,7 +1269,7 @@ struct timespec get_monotonic_coarse(void)
do {
seq = read_seqbegin(&timekeeper.lock);
- now = timekeeper.xtime;
+ now = tk_xtime(&timekeeper);
mono = timekeeper.wall_to_monotonic;
} while (read_seqretry(&timekeeper.lock, seq));
@@ -1240,12 +1304,43 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
do {
seq = read_seqbegin(&timekeeper.lock);
- *xtim = timekeeper.xtime;
+ *xtim = tk_xtime(&timekeeper);
*wtom = timekeeper.wall_to_monotonic;
*sleep = timekeeper.total_sleep_time;
} while (read_seqretry(&timekeeper.lock, seq));
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ *
+ * Returns current monotonic time and updates the offsets
+ * Called from hrtimer_interrupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
+{
+ ktime_t now;
+ unsigned int seq;
+ u64 secs, nsecs;
+
+ do {
+ seq = read_seqbegin(&timekeeper.lock);
+
+ secs = timekeeper.xtime_sec;
+ nsecs = timekeeping_get_ns(&timekeeper);
+
+ *offs_real = timekeeper.offs_real;
+ *offs_boot = timekeeper.offs_boot;
+ } while (read_seqretry(&timekeeper.lock, seq));
+
+ now = ktime_add_ns(ktime_set(secs, 0), nsecs);
+ now = ktime_sub(now, *offs_real);
+ return now;
+}
+#endif
+
/**
* ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
*/
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455549f..af5a7e9f164 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
P(nohz_mode);
- P_ns(idle_tick);
+ P_ns(last_tick);
P(tick_stopped);
P(idle_jiffies);
P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.6\n");
+ SEQ_printf(m, "Timer List Version: v0.7\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ec7e7e0db4..a61c09374eb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -77,6 +77,7 @@ struct tvec_base {
struct timer_list *running_timer;
unsigned long timer_jiffies;
unsigned long next_timer;
+ unsigned long active_timers;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
@@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+ __internal_add_timer(base, timer);
+ /*
+ * Update base->active_timers and base->next_timer
+ */
+ if (!tbase_get_deferrable(timer->base)) {
+ if (time_before(timer->expires, base->next_timer))
+ base->next_timer = timer->expires;
+ base->active_timers++;
+ }
+}
+
#ifdef CONFIG_TIMER_STATS
void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
{
@@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer,
}
EXPORT_SYMBOL(init_timer_deferrable_key);
-static inline void detach_timer(struct timer_list *timer,
- int clear_pending)
+static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
struct list_head *entry = &timer->entry;
@@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer,
entry->prev = LIST_POISON2;
}
+static inline void
+detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+{
+ detach_timer(timer, true);
+ if (!tbase_get_deferrable(timer->base))
+ timer->base->active_timers--;
+}
+
+static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+ bool clear_pending)
+{
+ if (!timer_pending(timer))
+ return 0;
+
+ detach_timer(timer, clear_pending);
+ if (!tbase_get_deferrable(timer->base)) {
+ timer->base->active_timers--;
+ if (timer->expires == base->next_timer)
+ base->next_timer = base->timer_jiffies;
+ }
+ return 1;
+}
+
/*
* We are using hashed locking: holding per_cpu(tvec_bases).lock
* means that all timers which are tied to this base via timer->base are
@@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
base = lock_timer_base(timer, &flags);
- if (timer_pending(timer)) {
- detach_timer(timer, 0);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
- } else {
- if (pending_only)
- goto out_unlock;
- }
+ ret = detach_if_pending(timer, base, false);
+ if (!ret && pending_only)
+ goto out_unlock;
debug_activate(timer, expires);
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
- if (time_before(timer->expires, base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
@@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
debug_activate(timer, timer->expires);
- if (time_before(timer->expires, base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = timer->expires;
internal_add_timer(base, timer);
/*
* Check whether the other CPU is idle and needs to be
@@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer)
timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
- if (timer_pending(timer)) {
- detach_timer(timer, 1);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
- }
+ ret = detach_if_pending(timer, base, true);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
- if (base->running_timer == timer)
- goto out;
-
- timer_stats_timer_clear_start_info(timer);
- ret = 0;
- if (timer_pending(timer)) {
- detach_timer(timer, 1);
- if (timer->expires == base->next_timer &&
- !tbase_get_deferrable(timer->base))
- base->next_timer = base->timer_jiffies;
- ret = 1;
+ if (base->running_timer != timer) {
+ timer_stats_timer_clear_start_info(timer);
+ ret = detach_if_pending(timer, base, true);
}
-out:
spin_unlock_irqrestore(&base->lock, flags);
return ret;
@@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
BUG_ON(tbase_get_base(timer->base) != base);
- internal_add_timer(base, timer);
+ /* No accounting, while moving them */
+ __internal_add_timer(base, timer);
}
return index;
@@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
timer_stats_account_timer(timer);
base->running_timer = timer;
- detach_timer(timer, 1);
+ detach_expired_timer(timer, base);
spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, data);
@@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
unsigned long get_next_timer_interrupt(unsigned long now)
{
struct tvec_base *base = __this_cpu_read(tvec_bases);
- unsigned long expires;
+ unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
/*
* Pretend that there is no timer pending if the cpu is offline.
* Possible pending timers will be migrated later to an active cpu.
*/
if (cpu_is_offline(smp_processor_id()))
- return now + NEXT_TIMER_MAX_DELTA;
+ return expires;
+
spin_lock(&base->lock);
- if (time_before_eq(base->next_timer, base->timer_jiffies))
- base->next_timer = __next_timer_interrupt(base);
- expires = base->next_timer;
+ if (base->active_timers) {
+ if (time_before_eq(base->next_timer, base->timer_jiffies))
+ base->next_timer = __next_timer_interrupt(base);
+ expires = base->next_timer;
+ }
spin_unlock(&base->lock);
if (time_before_eq(expires, now))
@@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu)
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
+ base->active_timers = 0;
return 0;
}
@@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
while (!list_empty(head)) {
timer = list_first_entry(head, struct timer_list, entry);
- detach_timer(timer, 0);
+ /* We ignore the accounting on the dying cpu */
+ detach_timer(timer, false);
timer_set_base(timer, new_base);
- if (time_before(timer->expires, new_base->next_timer) &&
- !tbase_get_deferrable(timer->base))
- new_base->next_timer = timer->expires;
internal_add_timer(new_base, timer);
}
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c..b4f20fba09f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- if (ftrace_disabled)
+ if (unlikely(ftrace_disabled))
return -ENODEV;
if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_lock);
- if (unlikely(ftrace_disabled))
- goto out_unlock;
-
ret = __register_ftrace_function(ops);
if (!ret)
ret = ftrace_startup(ops, 0);
-
- out_unlock:
mutex_unlock(&ftrace_lock);
+
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5..49491fa7daa 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
rb_init_page(bpage->page);
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
ret = rb_allocate_pages(cpu_buffer, nr_pages);
if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
* If something was added to this page, it was full
* since it is not the tail page. So we deduct the
* bytes consumed in ring buffer from here.
- * No need to update overruns, since this page is
- * deleted from ring buffer and its entries are
- * already accounted for.
+ * Increment overrun to account for the lost events.
*/
+ local_add(page_entries, &cpu_buffer->overrun);
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
}
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (cpu_buffer->commit_page == cpu_buffer->reader_page)
goto out;
+ /* Don't bother swapping if the ring buffer is empty */
+ if (rb_num_of_entries(cpu_buffer) == 0)
+ goto out;
+
/*
* Reset the reader page to size zero.
*/
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 49249c28690..5c38c81496c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
current_trace = saved_tracer;
if (ret) {
printk(KERN_CONT "FAILED!\n");
+ /* Add the warning after printing 'FAILED' */
+ WARN_ON(1);
goto out;
}
/* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
static void trace_iterator_increment(struct trace_iterator *iter)
{
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
+
iter->idx++;
- if (iter->buffer_iter[iter->cpu])
- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+ if (buf_iter)
+ ring_buffer_read(buf_iter, NULL);
}
static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
unsigned long *lost_events)
{
struct ring_buffer_event *event;
- struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
tr->data[cpu]->skipped_entries = 0;
- if (!iter->buffer_iter[cpu])
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (!buf_iter)
return;
- buf_iter = iter->buffer_iter[cpu];
ring_buffer_iter_reset(buf_iter);
/*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
int trace_empty(struct trace_iterator *iter)
{
+ struct ring_buffer_iter *buf_iter;
int cpu;
/* If we are looking at one CPU buffer, only check that one */
if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
cpu = iter->cpu_file;
- if (iter->buffer_iter[cpu]) {
- if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
}
for_each_tracing_cpu(cpu) {
- if (iter->buffer_iter[cpu]) {
- if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
if (!iter)
return ERR_PTR(-ENOMEM);
+ iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ GFP_KERNEL);
+ if (!iter->buffer_iter)
+ goto release;
+
/*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
fail:
mutex_unlock(&trace_types_lock);
kfree(iter->trace);
+ kfree(iter->buffer_iter);
+release:
seq_release_private(inode, file);
return ERR_PTR(-ENOMEM);
}
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
mutex_destroy(&iter->mutex);
free_cpumask_var(iter->started);
kfree(iter->trace);
+ kfree(iter->buffer_iter);
seq_release_private(inode, file);
return 0;
}
@@ -3172,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
}
destroy_trace_option_files(topts);
- current_trace = t;
+ current_trace = &nop_trace;
- topts = create_trace_option_files(current_trace);
- if (current_trace->use_max_tr) {
+ topts = create_trace_option_files(t);
+ if (t->use_max_tr) {
int cpu;
/* we need to make per cpu buffer sizes equivalent */
for_each_tracing_cpu(cpu) {
@@ -3195,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
+ current_trace = t;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -3609,6 +3625,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.pages = pages_def,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
@@ -3680,7 +3697,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
ret = splice_to_pipe(pipe, &spd);
out:
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
return ret;
out_err:
@@ -4231,6 +4248,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
struct splice_pipe_desc spd = {
.pages = pages_def,
.partial = partial_def,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
@@ -4318,7 +4336,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ret = splice_to_pipe(pipe, &spd);
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
out:
return ret;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de..55e1f7f0db1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
#define TRACE_PIPE_ALL_CPU -1
+static inline struct ring_buffer_iter *
+trace_buffer_iter(struct trace_iterator *iter, int cpu)
+{
+ if (iter->buffer_iter && iter->buffer_iter[cpu])
+ return iter->buffer_iter[cpu];
+ return NULL;
+}
+
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void trace_wake_up(void);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db0..a426f410c06 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/pstore.h>
#include <linux/fs.h>
#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
preempt_enable_notrace();
}
+/* Our two options */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+ TRACE_FUNC_OPT_PSTORE = 0x2,
+};
+
+static struct tracer_flags func_flags;
+
static void
function_trace_call(unsigned long ip, unsigned long parent_ip)
{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
+ /*
+ * So far tracing doesn't support multiple buffers, so
+ * we make an explicit call for now.
+ */
+ if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
+ pstore_ftrace_call(ip, parent_ip);
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
.flags = FTRACE_OPS_FL_GLOBAL,
};
-/* Our two options */
-enum {
- TRACE_FUNC_OPT_STACK = 0x1,
-};
-
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
+#ifdef CONFIG_PSTORE_FTRACE
+ { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
+#endif
{ } /* Always set a last empty entry */
};
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
static int func_set_flag(u32 old_flags, u32 bit, int set)
{
- if (bit == TRACE_FUNC_OPT_STACK) {
+ switch (bit) {
+ case TRACE_FUNC_OPT_STACK:
/* do nothing if already set */
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
- return 0;
+ break;
if (set) {
unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
register_ftrace_function(&trace_ops);
}
- return 0;
+ break;
+ case TRACE_FUNC_OPT_PSTORE:
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+ return 0;
}
static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d..ce27c8ba8d3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
next = &data->ret;
} else {
- ring_iter = iter->buffer_iter[iter->cpu];
+ ring_iter = trace_buffer_iter(iter, iter->cpu);
/* First peek to compare current entry and the next one */
if (ring_iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c..123b189c732 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
return 0;
}
-device_initcall(init_events);
+early_initcall(init_events);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7..69add8a9da6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
/*
* Create/destroy watchdog threads as CPUs come and go:
*/
-static int __cpuinit
+static int
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata cpu_nfb = {
+static struct notifier_block cpu_nfb = {
.notifier_call = cpu_callback
};
+#ifdef CONFIG_SUSPEND
+/*
+ * On exit from suspend we force an offline->online transition on the boot CPU
+ * so that the PMU state that was lost while in suspended state gets set up
+ * properly for the boot CPU. This information is required for restarting the
+ * NMI watchdog.
+ */
+void lockup_detector_bootcpu_resume(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+
+ cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
+ cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
+ cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
+}
+#endif
+
void __init lockup_detector_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67d..692d97628a1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
#include "workqueue_sched.h"
enum {
- /* global_cwq flags */
- GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
- GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
- GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
- GCWQ_FREEZING = 1 << 3, /* freeze in progress */
- GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
+ /*
+ * global_cwq flags
+ *
+ * A bound gcwq is either associated or disassociated with its CPU.
+ * While associated (!DISASSOCIATED), all workers are bound to the
+ * CPU and none has %WORKER_UNBOUND set and concurrency management
+ * is in effect.
+ *
+ * While DISASSOCIATED, the cpu may be offline and all workers have
+ * %WORKER_UNBOUND set and concurrency management disabled, and may
+ * be executing on any CPU. The gcwq behaves as an unbound one.
+ *
+ * Note that DISASSOCIATED can be flipped only while holding
+ * managership of all pools on the gcwq to avoid changing binding
+ * state while create_worker() is in progress.
+ */
+ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
+ GCWQ_FREEZING = 1 << 1, /* freeze in progress */
+
+ /* pool flags */
+ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
/* worker flags */
WORKER_STARTED = 1 << 0, /* started */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
- WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
WORKER_REBIND = 1 << 5, /* mom is home, come back */
WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
WORKER_UNBOUND = 1 << 7, /* worker is unbound */
- WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
- WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
+ WORKER_CPU_INTENSIVE,
- /* gcwq->trustee_state */
- TRUSTEE_START = 0, /* start */
- TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
- TRUSTEE_BUTCHER = 2, /* butcher workers */
- TRUSTEE_RELEASE = 3, /* release workers */
- TRUSTEE_DONE = 4, /* trustee is done */
+ NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
(min two ticks) */
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
- TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give -20.
*/
RESCUER_NICE_LEVEL = -20,
+ HIGHPRI_NICE_LEVEL = -20,
};
/*
@@ -115,6 +124,8 @@ enum {
*/
struct global_cwq;
+struct worker_pool;
+struct idle_rebind;
/*
* The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
- struct global_cwq *gcwq; /* I: the associated gcwq */
+ struct worker_pool *pool; /* I: the associated pool */
/* 64 bytes boundary on 64bit, 32 on 32bit */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
- struct work_struct rebind_work; /* L: rebind worker to cpu */
+
+ /* for rebinding worker to CPU */
+ struct idle_rebind *idle_rebind; /* L: for idle worker */
+ struct work_struct rebind_work; /* L: for busy worker */
+};
+
+struct worker_pool {
+ struct global_cwq *gcwq; /* I: the owning gcwq */
+ unsigned int flags; /* X: flags */
+
+ struct list_head worklist; /* L: list of pending works */
+ int nr_workers; /* L: total number of workers */
+ int nr_idle; /* L: currently idle ones */
+
+ struct list_head idle_list; /* X: list of idle workers */
+ struct timer_list idle_timer; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
+
+ struct mutex manager_mutex; /* mutex manager should hold */
+ struct ida worker_ida; /* L: for worker IDs */
};
/*
@@ -146,27 +176,16 @@ struct worker {
*/
struct global_cwq {
spinlock_t lock; /* the gcwq lock */
- struct list_head worklist; /* L: list of pending works */
unsigned int cpu; /* I: the associated cpu */
unsigned int flags; /* L: GCWQ_* flags */
- int nr_workers; /* L: total number of workers */
- int nr_idle; /* L: currently idle ones */
-
- /* workers are chained either in the idle_list or busy_hash */
- struct list_head idle_list; /* X: list of idle workers */
+ /* workers are chained either in busy_hash or pool idle_list */
struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
/* L: hash of busy workers */
- struct timer_list idle_timer; /* L: worker idle timeout */
- struct timer_list mayday_timer; /* L: SOS timer for dworkers */
-
- struct ida worker_ida; /* L: for worker IDs */
+ struct worker_pool pools[2]; /* normal and highpri pools */
- struct task_struct *trustee; /* L: for gcwq shutdown */
- unsigned int trustee_state; /* L: trustee state */
- wait_queue_head_t trustee_wait; /* trustee wait */
- struct worker *first_idle; /* L: first idle worker */
+ wait_queue_head_t rebind_hold; /* rebind hold wait */
} ____cacheline_aligned_in_smp;
/*
@@ -175,7 +194,7 @@ struct global_cwq {
* aligned at two's power of the number of flag bits.
*/
struct cpu_workqueue_struct {
- struct global_cwq *gcwq; /* I: the associated gcwq */
+ struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
int flush_color; /* L: flushing color */
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
+#define for_each_worker_pool(pool, gcwq) \
+ for ((pool) = &(gcwq)->pools[0]; \
+ (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
+
#define for_each_busy_worker(worker, i, pos, gcwq) \
for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
* try_to_wake_up(). Put it in a separate cacheline.
*/
static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
/*
* Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
* workers have WORKER_UNBOUND set.
*/
static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
+static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
+ [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
+};
static int worker_thread(void *__worker);
+static int worker_pool_pri(struct worker_pool *pool)
+{
+ return pool - pool->gcwq->pools;
+}
+
static struct global_cwq *get_gcwq(unsigned int cpu)
{
if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
return &unbound_global_cwq;
}
-static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+static atomic_t *get_pool_nr_running(struct worker_pool *pool)
{
+ int cpu = pool->gcwq->cpu;
+ int idx = worker_pool_pri(pool);
+
if (cpu != WORK_CPU_UNBOUND)
- return &per_cpu(gcwq_nr_running, cpu);
+ return &per_cpu(pool_nr_running, cpu)[idx];
else
- return &unbound_gcwq_nr_running;
+ return &unbound_pool_nr_running[idx];
}
static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
if (data & WORK_STRUCT_CWQ)
return ((struct cpu_workqueue_struct *)
- (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
cpu = data >> WORK_STRUCT_FLAG_BITS;
if (cpu == WORK_CPU_NONE)
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
}
/*
- * Policy functions. These define the policies on how the global
- * worker pool is managed. Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions. These define the policies on how the global worker
+ * pools are managed. Unless noted otherwise, these functions assume that
+ * they're being called with gcwq->lock held.
*/
-static bool __need_more_worker(struct global_cwq *gcwq)
+static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
- gcwq->flags & GCWQ_HIGHPRI_PENDING;
+ return !atomic_read(get_pool_nr_running(pool));
}
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound gcwq as long as the
+ * worklist isn't empty.
*/
-static bool need_more_worker(struct global_cwq *gcwq)
+static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+ return !list_empty(&pool->worklist) && __need_more_worker(pool);
}
/* Can I start working? Called from busy but !running workers. */
-static bool may_start_working(struct global_cwq *gcwq)
+static bool may_start_working(struct worker_pool *pool)
{
- return gcwq->nr_idle;
+ return pool->nr_idle;
}
/* Do I need to keep working? Called from currently running workers. */
-static bool keep_working(struct global_cwq *gcwq)
+static bool keep_working(struct worker_pool *pool)
{
- atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+ atomic_t *nr_running = get_pool_nr_running(pool);
- return !list_empty(&gcwq->worklist) &&
- (atomic_read(nr_running) <= 1 ||
- gcwq->flags & GCWQ_HIGHPRI_PENDING);
+ return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
}
/* Do we need a new worker? Called from manager. */
-static bool need_to_create_worker(struct global_cwq *gcwq)
+static bool need_to_create_worker(struct worker_pool *pool)
{
- return need_more_worker(gcwq) && !may_start_working(gcwq);
+ return need_more_worker(pool) && !may_start_working(pool);
}
/* Do I need to be the manager? */
-static bool need_to_manage_workers(struct global_cwq *gcwq)
+static bool need_to_manage_workers(struct worker_pool *pool)
{
- return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+ return need_to_create_worker(pool) ||
+ (pool->flags & POOL_MANAGE_WORKERS);
}
/* Do we have too many workers and should some go away? */
-static bool too_many_workers(struct global_cwq *gcwq)
+static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
- int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
- int nr_busy = gcwq->nr_workers - nr_idle;
+ bool managing = mutex_is_locked(&pool->manager_mutex);
+ int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+ int nr_busy = pool->nr_workers - nr_idle;
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
*/
/* Return the first worker. Safe with preemption disabled */
-static struct worker *first_worker(struct global_cwq *gcwq)
+static struct worker *first_worker(struct worker_pool *pool)
{
- if (unlikely(list_empty(&gcwq->idle_list)))
+ if (unlikely(list_empty(&pool->idle_list)))
return NULL;
- return list_first_entry(&gcwq->idle_list, struct worker, entry);
+ return list_first_entry(&pool->idle_list, struct worker, entry);
}
/**
* wake_up_worker - wake up an idle worker
- * @gcwq: gcwq to wake worker for
+ * @pool: worker pool to wake worker from
*
- * Wake up the first idle worker of @gcwq.
+ * Wake up the first idle worker of @pool.
*
* CONTEXT:
* spin_lock_irq(gcwq->lock).
*/
-static void wake_up_worker(struct global_cwq *gcwq)
+static void wake_up_worker(struct worker_pool *pool)
{
- struct worker *worker = first_worker(gcwq);
+ struct worker *worker = first_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
struct worker *worker = kthread_data(task);
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_gcwq_nr_running(cpu));
+ atomic_inc(get_pool_nr_running(worker->pool));
}
/**
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
unsigned int cpu)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
- struct global_cwq *gcwq = get_gcwq(cpu);
- atomic_t *nr_running = get_gcwq_nr_running(cpu);
+ struct worker_pool *pool = worker->pool;
+ atomic_t *nr_running = get_pool_nr_running(pool);
if (worker->flags & WORKER_NOT_RUNNING)
return NULL;
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
*
- * NOT_RUNNING is clear. This means that trustee is not in
- * charge and we're running on the local cpu w/ rq lock held
- * and preemption disabled, which in turn means that none else
- * could be manipulating idle_list, so dereferencing idle_list
- * without gcwq lock is safe.
+ * NOT_RUNNING is clear. This means that we're bound to and
+ * running on the local cpu w/ rq lock held and preemption
+ * disabled, which in turn means that no one else could be
+ * manipulating idle_list, so dereferencing idle_list without gcwq
+ * lock is safe.
*/
- if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
- to_wakeup = first_worker(gcwq);
+ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
+ to_wakeup = first_worker(pool);
return to_wakeup ? to_wakeup->task : NULL;
}
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
static inline void worker_set_flags(struct worker *worker, unsigned int flags,
bool wakeup)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
*/
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+ atomic_t *nr_running = get_pool_nr_running(pool);
if (wakeup) {
if (atomic_dec_and_test(nr_running) &&
- !list_empty(&gcwq->worklist))
- wake_up_worker(gcwq);
+ !list_empty(&pool->worklist))
+ wake_up_worker(pool);
} else
atomic_dec(nr_running);
}
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
WARN_ON_ONCE(worker->task != current);
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+ atomic_inc(get_pool_nr_running(pool));
}
/**
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
}
/**
- * gcwq_determine_ins_pos - find insertion position
- * @gcwq: gcwq of interest
- * @cwq: cwq a work is being queued for
- *
- * A work for @cwq is about to be queued on @gcwq, determine insertion
- * position for the work. If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue. This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
- * there are HIGHPRI works pending.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
- */
-static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
- struct cpu_workqueue_struct *cwq)
-{
- struct work_struct *twork;
-
- if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
- return &gcwq->worklist;
-
- list_for_each_entry(twork, &gcwq->worklist, entry) {
- struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
-
- if (!(tcwq->wq->flags & WQ_HIGHPRI))
- break;
- }
-
- gcwq->flags |= GCWQ_HIGHPRI_PENDING;
- return &twork->entry;
-}
-
-/**
* insert_work - insert a work into gcwq
* @cwq: cwq @work belongs to
* @work: work to insert
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work, struct list_head *head,
unsigned int extra_flags)
{
- struct global_cwq *gcwq = cwq->gcwq;
+ struct worker_pool *pool = cwq->pool;
/* we own @work, set data and link */
set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
*/
smp_mb();
- if (__need_more_worker(gcwq))
- wake_up_worker(gcwq);
+ if (__need_more_worker(pool))
+ wake_up_worker(pool);
}
/*
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
if (likely(cwq->nr_active < cwq->max_active)) {
trace_workqueue_activate_work(work);
cwq->nr_active++;
- worklist = gcwq_determine_ins_pos(gcwq, cwq);
+ worklist = &cwq->pool->worklist;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &cwq->delayed_works;
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
*/
static void worker_enter_idle(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
+ struct global_cwq *gcwq = pool->gcwq;
BUG_ON(worker->flags & WORKER_IDLE);
BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
/* can't use worker_set_flags(), also called from start_worker() */
worker->flags |= WORKER_IDLE;
- gcwq->nr_idle++;
+ pool->nr_idle++;
worker->last_active = jiffies;
/* idle_list is LIFO */
- list_add(&worker->entry, &gcwq->idle_list);
+ list_add(&worker->entry, &pool->idle_list);
- if (likely(!(worker->flags & WORKER_ROGUE))) {
- if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
- mod_timer(&gcwq->idle_timer,
- jiffies + IDLE_WORKER_TIMEOUT);
- } else
- wake_up_all(&gcwq->trustee_wait);
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because trustee releases gcwq->lock
- * between setting %WORKER_ROGUE and zapping nr_running, the
- * warning may trigger spuriously. Check iff trustee is idle.
+ * Sanity check nr_running. Because gcwq_unbind_fn() releases
+ * gcwq->lock between setting %WORKER_UNBOUND and zapping
+ * nr_running, the warning may trigger spuriously. Check iff
+ * unbind is not in progress.
*/
- WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
- gcwq->nr_workers == gcwq->nr_idle &&
- atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+ WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
+ pool->nr_workers == pool->nr_idle &&
+ atomic_read(get_pool_nr_running(pool)));
}
/**
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
*/
static void worker_leave_idle(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
BUG_ON(!(worker->flags & WORKER_IDLE));
worker_clr_flags(worker, WORKER_IDLE);
- gcwq->nr_idle--;
+ pool->nr_idle--;
list_del_init(&worker->entry);
}
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
* verbatim as it's best effort and blocking and gcwq may be
* [dis]associated in the meantime.
*
- * This function tries set_cpus_allowed() and locks gcwq and verifies
- * the binding against GCWQ_DISASSOCIATED which is set during
- * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
- * idle state or fetches works without dropping lock, it can guarantee
- * the scheduling requirement described in the first paragraph.
+ * This function tries set_cpus_allowed() and locks gcwq and verifies the
+ * binding against %GCWQ_DISASSOCIATED which is set during
+ * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
+ * enters idle state or fetches works without dropping lock, it can
+ * guarantee the scheduling requirement described in the first paragraph.
*
* CONTEXT:
* Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
static bool worker_maybe_bind_and_lock(struct worker *worker)
__acquires(&gcwq->lock)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct global_cwq *gcwq = worker->pool->gcwq;
struct task_struct *task = worker->task;
while (true) {
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
}
}
+struct idle_rebind {
+ int cnt; /* # workers to be rebound */
+ struct completion done; /* all workers rebound */
+};
+
+/*
+ * Rebind an idle @worker to its CPU. During CPU onlining, this has to
+ * happen synchronously for idle workers. worker_thread() will test
+ * %WORKER_REBIND before leaving idle and call this function.
+ */
+static void idle_worker_rebind(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->pool->gcwq;
+
+ /* CPU must be online at this point */
+ WARN_ON(!worker_maybe_bind_and_lock(worker));
+ if (!--worker->idle_rebind->cnt)
+ complete(&worker->idle_rebind->done);
+ spin_unlock_irq(&worker->pool->gcwq->lock);
+
+ /* we did our part, wait for rebind_workers() to finish up */
+ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+}
+
/*
- * Function for worker->rebind_work used to rebind rogue busy workers
- * to the associated cpu which is coming back online. This is
- * scheduled by cpu up but can race with other cpu hotplug operations
- * and may be executed twice without intervening cpu down.
+ * Function for @worker->rebind_work used to rebind unbound busy workers to
+ * the associated cpu which is coming back online. This is scheduled by
+ * cpu up but can race with other cpu hotplug operations and may be
+ * executed twice without intervening cpu down.
*/
-static void worker_rebind_fn(struct work_struct *work)
+static void busy_worker_rebind_fn(struct work_struct *work)
{
struct worker *worker = container_of(work, struct worker, rebind_work);
- struct global_cwq *gcwq = worker->gcwq;
+ struct global_cwq *gcwq = worker->pool->gcwq;
if (worker_maybe_bind_and_lock(worker))
worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work)
spin_unlock_irq(&gcwq->lock);
}
+/**
+ * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * @gcwq: gcwq of interest
+ *
+ * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
+ * is different for idle and busy ones.
+ *
+ * The idle ones should be rebound synchronously and idle rebinding should
+ * be complete before any worker starts executing work items with
+ * concurrency management enabled; otherwise, scheduler may oops trying to
+ * wake up non-local idle worker from wq_worker_sleeping().
+ *
+ * This is achieved by repeatedly requesting rebinding until all idle
+ * workers are known to have been rebound under @gcwq->lock and holding all
+ * idle workers from becoming busy until idle rebinding is complete.
+ *
+ * Once idle workers are rebound, busy workers can be rebound as they
+ * finish executing their current work items. Queueing the rebind work at
+ * the head of their scheduled lists is enough. Note that nr_running will
+ * be properly bumped as busy workers rebind.
+ *
+ * On return, all workers are guaranteed to either be bound or have rebind
+ * work item scheduled.
+ */
+static void rebind_workers(struct global_cwq *gcwq)
+ __releases(&gcwq->lock) __acquires(&gcwq->lock)
+{
+ struct idle_rebind idle_rebind;
+ struct worker_pool *pool;
+ struct worker *worker;
+ struct hlist_node *pos;
+ int i;
+
+ lockdep_assert_held(&gcwq->lock);
+
+ for_each_worker_pool(pool, gcwq)
+ lockdep_assert_held(&pool->manager_mutex);
+
+ /*
+ * Rebind idle workers. Interlocked both ways. We wait for
+ * workers to rebind via @idle_rebind.done. Workers will wait for
+ * us to finish up by watching %WORKER_REBIND.
+ */
+ init_completion(&idle_rebind.done);
+retry:
+ idle_rebind.cnt = 1;
+ INIT_COMPLETION(idle_rebind.done);
+
+ /* set REBIND and kick idle ones, we'll wait for these later */
+ for_each_worker_pool(pool, gcwq) {
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ if (worker->flags & WORKER_REBIND)
+ continue;
+
+ /* morph UNBOUND to REBIND */
+ worker->flags &= ~WORKER_UNBOUND;
+ worker->flags |= WORKER_REBIND;
+
+ idle_rebind.cnt++;
+ worker->idle_rebind = &idle_rebind;
+
+ /* worker_thread() will call idle_worker_rebind() */
+ wake_up_process(worker->task);
+ }
+ }
+
+ if (--idle_rebind.cnt) {
+ spin_unlock_irq(&gcwq->lock);
+ wait_for_completion(&idle_rebind.done);
+ spin_lock_irq(&gcwq->lock);
+ /* busy ones might have become idle while waiting, retry */
+ goto retry;
+ }
+
+ /*
+ * All idle workers are rebound and waiting for %WORKER_REBIND to
+ * be cleared inside idle_worker_rebind(). Clear and release.
+ * Clearing %WORKER_REBIND from this foreign context is safe
+ * because these workers are still guaranteed to be idle.
+ */
+ for_each_worker_pool(pool, gcwq)
+ list_for_each_entry(worker, &pool->idle_list, entry)
+ worker->flags &= ~WORKER_REBIND;
+
+ wake_up_all(&gcwq->rebind_hold);
+
+ /* rebind busy workers */
+ for_each_busy_worker(worker, i, pos, gcwq) {
+ struct work_struct *rebind_work = &worker->rebind_work;
+
+ /* morph UNBOUND to REBIND */
+ worker->flags &= ~WORKER_UNBOUND;
+ worker->flags |= WORKER_REBIND;
+
+ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+ work_data_bits(rebind_work)))
+ continue;
+
+ /* wq doesn't matter, use the default one */
+ debug_work_activate(rebind_work);
+ insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
+ }
+}
+
static struct worker *alloc_worker(void)
{
struct worker *worker;
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void)
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
- INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+ INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void)
/**
* create_worker - create a new workqueue worker
- * @gcwq: gcwq the new worker will belong to
- * @bind: whether to set affinity to @cpu or not
+ * @pool: pool the new worker will belong to
*
- * Create a new worker which is bound to @gcwq. The returned worker
+ * Create a new worker which is bound to @pool. The returned worker
* can be started by calling start_worker() or destroyed using
* destroy_worker().
*
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void)
* RETURNS:
* Pointer to the newly created worker.
*/
-static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+static struct worker *create_worker(struct worker_pool *pool)
{
- bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+ struct global_cwq *gcwq = pool->gcwq;
+ const char *pri = worker_pool_pri(pool) ? "H" : "";
struct worker *worker = NULL;
int id = -1;
spin_lock_irq(&gcwq->lock);
- while (ida_get_new(&gcwq->worker_ida, &id)) {
+ while (ida_get_new(&pool->worker_ida, &id)) {
spin_unlock_irq(&gcwq->lock);
- if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+ if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
goto fail;
spin_lock_irq(&gcwq->lock);
}
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
if (!worker)
goto fail;
- worker->gcwq = gcwq;
+ worker->pool = pool;
worker->id = id;
- if (!on_unbound_cpu)
+ if (gcwq->cpu != WORK_CPU_UNBOUND)
worker->task = kthread_create_on_node(worker_thread,
- worker,
- cpu_to_node(gcwq->cpu),
- "kworker/%u:%d", gcwq->cpu, id);
+ worker, cpu_to_node(gcwq->cpu),
+ "kworker/%u:%d%s", gcwq->cpu, id, pri);
else
worker->task = kthread_create(worker_thread, worker,
- "kworker/u:%d", id);
+ "kworker/u:%d%s", id, pri);
if (IS_ERR(worker->task))
goto fail;
+ if (worker_pool_pri(pool))
+ set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+
/*
- * A rogue worker will become a regular one if CPU comes
- * online later on. Make sure every worker has
- * PF_THREAD_BOUND set.
+ * Determine CPU binding of the new worker depending on
+ * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
+ * flag remains stable across this function. See the comments
+ * above the flag definition for details.
+ *
+ * As an unbound worker may later become a regular one if CPU comes
+ * online, make sure every worker has %PF_THREAD_BOUND set.
*/
- if (bind && !on_unbound_cpu)
+ if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
kthread_bind(worker->task, gcwq->cpu);
- else {
+ } else {
worker->task->flags |= PF_THREAD_BOUND;
- if (on_unbound_cpu)
- worker->flags |= WORKER_UNBOUND;
+ worker->flags |= WORKER_UNBOUND;
}
return worker;
fail:
if (id >= 0) {
spin_lock_irq(&gcwq->lock);
- ida_remove(&gcwq->worker_ida, id);
+ ida_remove(&pool->worker_ida, id);
spin_unlock_irq(&gcwq->lock);
}
kfree(worker);
@@ -1424,7 +1555,7 @@ fail:
static void start_worker(struct worker *worker)
{
worker->flags |= WORKER_STARTED;
- worker->gcwq->nr_workers++;
+ worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
}
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker)
*/
static void destroy_worker(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
+ struct global_cwq *gcwq = pool->gcwq;
int id = worker->id;
/* sanity check frenzy */
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker)
BUG_ON(!list_empty(&worker->scheduled));
if (worker->flags & WORKER_STARTED)
- gcwq->nr_workers--;
+ pool->nr_workers--;
if (worker->flags & WORKER_IDLE)
- gcwq->nr_idle--;
+ pool->nr_idle--;
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker)
kfree(worker);
spin_lock_irq(&gcwq->lock);
- ida_remove(&gcwq->worker_ida, id);
+ ida_remove(&pool->worker_ida, id);
}
-static void idle_worker_timeout(unsigned long __gcwq)
+static void idle_worker_timeout(unsigned long __pool)
{
- struct global_cwq *gcwq = (void *)__gcwq;
+ struct worker_pool *pool = (void *)__pool;
+ struct global_cwq *gcwq = pool->gcwq;
spin_lock_irq(&gcwq->lock);
- if (too_many_workers(gcwq)) {
+ if (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
- worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires))
- mod_timer(&gcwq->idle_timer, expires);
+ mod_timer(&pool->idle_timer, expires);
else {
/* it's been idle for too long, wake up manager */
- gcwq->flags |= GCWQ_MANAGE_WORKERS;
- wake_up_worker(gcwq);
+ pool->flags |= POOL_MANAGE_WORKERS;
+ wake_up_worker(pool);
}
}
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work)
return false;
/* mayday mayday mayday */
- cpu = cwq->gcwq->cpu;
+ cpu = cwq->pool->gcwq->cpu;
/* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
if (cpu == WORK_CPU_UNBOUND)
cpu = 0;
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work)
return true;
}
-static void gcwq_mayday_timeout(unsigned long __gcwq)
+static void gcwq_mayday_timeout(unsigned long __pool)
{
- struct global_cwq *gcwq = (void *)__gcwq;
+ struct worker_pool *pool = (void *)__pool;
+ struct global_cwq *gcwq = pool->gcwq;
struct work_struct *work;
spin_lock_irq(&gcwq->lock);
- if (need_to_create_worker(gcwq)) {
+ if (need_to_create_worker(pool)) {
/*
* We've been trying to create a new worker but
* haven't been successful. We might be hitting an
* allocation deadlock. Send distress signals to
* rescuers.
*/
- list_for_each_entry(work, &gcwq->worklist, entry)
+ list_for_each_entry(work, &pool->worklist, entry)
send_mayday(work);
}
spin_unlock_irq(&gcwq->lock);
- mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
/**
* maybe_create_worker - create a new worker if necessary
- * @gcwq: gcwq to create a new worker for
+ * @pool: pool to create a new worker for
*
- * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
+ * Create a new worker for @pool if necessary. @pool is guaranteed to
* have at least one idle worker on return from this function. If
* creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @gcwq to resolve
+ * sent to all rescuers with works scheduled on @pool to resolve
* possible allocation deadlock.
*
* On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
* false if no action was taken and gcwq->lock stayed locked, true
* otherwise.
*/
-static bool maybe_create_worker(struct global_cwq *gcwq)
+static bool maybe_create_worker(struct worker_pool *pool)
__releases(&gcwq->lock)
__acquires(&gcwq->lock)
{
- if (!need_to_create_worker(gcwq))
+ struct global_cwq *gcwq = pool->gcwq;
+
+ if (!need_to_create_worker(pool))
return false;
restart:
spin_unlock_irq(&gcwq->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
- mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
struct worker *worker;
- worker = create_worker(gcwq, true);
+ worker = create_worker(pool);
if (worker) {
- del_timer_sync(&gcwq->mayday_timer);
+ del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&gcwq->lock);
start_worker(worker);
- BUG_ON(need_to_create_worker(gcwq));
+ BUG_ON(need_to_create_worker(pool));
return true;
}
- if (!need_to_create_worker(gcwq))
+ if (!need_to_create_worker(pool))
break;
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(CREATE_COOLDOWN);
- if (!need_to_create_worker(gcwq))
+ if (!need_to_create_worker(pool))
break;
}
- del_timer_sync(&gcwq->mayday_timer);
+ del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&gcwq->lock);
- if (need_to_create_worker(gcwq))
+ if (need_to_create_worker(pool))
goto restart;
return true;
}
/**
* maybe_destroy_worker - destroy workers which have been idle for a while
- * @gcwq: gcwq to destroy workers for
+ * @pool: pool to destroy workers for
*
- * Destroy @gcwq workers which have been idle for longer than
+ * Destroy @pool workers which have been idle for longer than
* IDLE_WORKER_TIMEOUT.
*
* LOCKING:
@@ -1610,19 +1746,19 @@ restart:
* false if no action was taken and gcwq->lock stayed locked, true
* otherwise.
*/
-static bool maybe_destroy_workers(struct global_cwq *gcwq)
+static bool maybe_destroy_workers(struct worker_pool *pool)
{
bool ret = false;
- while (too_many_workers(gcwq)) {
+ while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
- worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
- mod_timer(&gcwq->idle_timer, expires);
+ mod_timer(&pool->idle_timer, expires);
break;
}
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
*/
static bool manage_workers(struct worker *worker)
{
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
bool ret = false;
- if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+ if (!mutex_trylock(&pool->manager_mutex))
return ret;
- gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
- gcwq->flags |= GCWQ_MANAGING_WORKERS;
+ pool->flags &= ~POOL_MANAGE_WORKERS;
/*
* Destroy and then create so that may_start_working() is true
* on return.
*/
- ret |= maybe_destroy_workers(gcwq);
- ret |= maybe_create_worker(gcwq);
-
- gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-
- /*
- * The trustee might be waiting to take over the manager
- * position, tell it we're done.
- */
- if (unlikely(gcwq->trustee))
- wake_up_all(&gcwq->trustee_wait);
+ ret |= maybe_destroy_workers(pool);
+ ret |= maybe_create_worker(pool);
+ mutex_unlock(&pool->manager_mutex);
return ret;
}
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
{
struct work_struct *work = list_first_entry(&cwq->delayed_works,
struct work_struct, entry);
- struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
trace_workqueue_activate_work(work);
- move_linked_works(work, pos, NULL);
+ move_linked_works(work, &cwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
cwq->nr_active++;
}
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock)
__acquires(&gcwq->lock)
{
struct cpu_workqueue_struct *cwq = get_work_cwq(work);
- struct global_cwq *gcwq = cwq->gcwq;
+ struct worker_pool *pool = worker->pool;
+ struct global_cwq *gcwq = pool->gcwq;
struct hlist_head *bwh = busy_worker_head(gcwq, work);
bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
work_func_t f = work->func;
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/*
+ * Ensure we're on the correct CPU. DISASSOCIATED test is
+ * necessary to avoid spurious warnings from rescuers servicing the
+ * unbound or a disassociated gcwq.
+ */
+ WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+ !(gcwq->flags & GCWQ_DISASSOCIATED) &&
+ raw_smp_processor_id() != gcwq->cpu);
+
+ /*
* A single work shouldn't be executed concurrently by
* multiple workers on a single cpu. Check whether anyone is
* already processing the work. If so, defer the work to the
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock)
list_del_init(&work->entry);
/*
- * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
- * wake up another worker; otherwise, clear HIGHPRI_PENDING.
- */
- if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
- struct work_struct *nwork = list_first_entry(&gcwq->worklist,
- struct work_struct, entry);
-
- if (!list_empty(&gcwq->worklist) &&
- get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
- wake_up_worker(gcwq);
- else
- gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
- }
-
- /*
* CPU intensive works don't participate in concurrency
* management. They're the scheduler's responsibility.
*/
if (unlikely(cpu_intensive))
worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+ /*
+ * Unbound gcwq isn't concurrency managed and work items should be
+ * executed ASAP. Wake up another worker if necessary.
+ */
+ if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ wake_up_worker(pool);
+
spin_unlock_irq(&gcwq->lock);
work_clear_pending(work);
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker)
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
- struct global_cwq *gcwq = worker->gcwq;
+ struct worker_pool *pool = worker->pool;
+ struct global_cwq *gcwq = pool->gcwq;
/* tell the scheduler that this is a workqueue worker */
worker->task->flags |= PF_WQ_WORKER;
woke_up:
spin_lock_irq(&gcwq->lock);
- /* DIE can be set only while we're idle, checking here is enough */
- if (worker->flags & WORKER_DIE) {
+ /*
+ * DIE can be set only while idle and REBIND set while busy has
+ * @worker->rebind_work scheduled. Checking here is enough.
+ */
+ if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
spin_unlock_irq(&gcwq->lock);
- worker->task->flags &= ~PF_WQ_WORKER;
- return 0;
+
+ if (worker->flags & WORKER_DIE) {
+ worker->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
+ idle_worker_rebind(worker);
+ goto woke_up;
}
worker_leave_idle(worker);
recheck:
/* no more worker necessary? */
- if (!need_more_worker(gcwq))
+ if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
- if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+ if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
@@ -1979,7 +2117,7 @@ recheck:
do {
struct work_struct *work =
- list_first_entry(&gcwq->worklist,
+ list_first_entry(&pool->worklist,
struct work_struct, entry);
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2129,11 @@ recheck:
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
- } while (keep_working(gcwq));
+ } while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP, false);
sleep:
- if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+ if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
goto recheck;
/*
@@ -2053,14 +2191,15 @@ repeat:
for_each_mayday_cpu(cpu, wq->mayday_mask) {
unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
- struct global_cwq *gcwq = cwq->gcwq;
+ struct worker_pool *pool = cwq->pool;
+ struct global_cwq *gcwq = pool->gcwq;
struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
mayday_clear_cpu(cpu, wq->mayday_mask);
/* migrate to the target cpu if possible */
- rescuer->gcwq = gcwq;
+ rescuer->pool = pool;
worker_maybe_bind_and_lock(rescuer);
/*
@@ -2068,7 +2207,7 @@ repeat:
* process'em.
*/
BUG_ON(!list_empty(&rescuer->scheduled));
- list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_cwq(work) == cwq)
move_linked_works(work, scheduled, &n);
@@ -2079,8 +2218,8 @@ repeat:
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(gcwq))
- wake_up_worker(gcwq);
+ if (keep_working(pool))
+ wake_up_worker(pool);
spin_unlock_irq(&gcwq->lock);
}
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
for_each_cwq_cpu(cpu, wq) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- struct global_cwq *gcwq = cwq->gcwq;
+ struct global_cwq *gcwq = cwq->pool->gcwq;
spin_lock_irq(&gcwq->lock);
@@ -2421,9 +2560,9 @@ reflush:
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
bool drained;
- spin_lock_irq(&cwq->gcwq->lock);
+ spin_lock_irq(&cwq->pool->gcwq->lock);
drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
- spin_unlock_irq(&cwq->gcwq->lock);
+ spin_unlock_irq(&cwq->pool->gcwq->lock);
if (drained)
continue;
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
*/
smp_rmb();
cwq = get_work_cwq(work);
- if (unlikely(!cwq || gcwq != cwq->gcwq))
+ if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
goto already_gone;
} else if (wait_executing) {
worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (flags & WQ_MEM_RECLAIM)
flags |= WQ_RESCUER;
- /*
- * Unbound workqueues aren't concurrency managed and should be
- * dispatched to workers immediately.
- */
- if (flags & WQ_UNBOUND)
- flags |= WQ_HIGHPRI;
-
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
for_each_cwq_cpu(cpu, wq) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
struct global_cwq *gcwq = get_gcwq(cpu);
+ int pool_idx = (bool)(flags & WQ_HIGHPRI);
BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
- cwq->gcwq = gcwq;
+ cwq->pool = &gcwq->pools[pool_idx];
cwq->wq = wq;
cwq->flush_color = -1;
cwq->max_active = max_active;
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy);
* gcwqs serve mix of short, long and very long running works making
* blocked draining impractical.
*
- * This is solved by allowing a gcwq to be detached from CPU, running
- * it with unbound (rogue) workers and allowing it to be reattached
- * later if the cpu comes back online. A separate thread is created
- * to govern a gcwq in such state and is called the trustee of the
- * gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START Command state used on startup. On CPU_DOWN_PREPARE, a
- * new trustee is started with this state.
- *
- * IN_CHARGE Once started, trustee will enter this state after
- * assuming the manager role and making all existing
- * workers rogue. DOWN_PREPARE waits for trustee to
- * enter this state. After reaching IN_CHARGE, trustee
- * tries to execute the pending worklist until it's empty
- * and the state is set to BUTCHER, or the state is set
- * to RELEASE.
- *
- * BUTCHER Command state which is set by the cpu callback after
- * the cpu has went down. Once this state is set trustee
- * knows that there will be no new works on the worklist
- * and once the worklist is empty it can proceed to
- * killing idle workers.
- *
- * RELEASE Command state which is set by the cpu callback if the
- * cpu down has been canceled or it has come online
- * again. After recognizing this state, trustee stops
- * trying to drain or butcher and clears ROGUE, rebinds
- * all remaining workers back to the cpu and releases
- * manager role.
- *
- * DONE Trustee will enter this state after BUTCHER or RELEASE
- * is complete.
- *
- * trustee CPU draining
- * took over down complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- * | | ^
- * | CPU is back online v return workers |
- * ----------------> RELEASE --------------
+ * This is solved by allowing a gcwq to be disassociated from the CPU,
+ * running as an unbound one, and allowing it to be reattached later if the
+ * cpu comes back online.
*/
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use. Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({ \
- long __ret = (timeout); \
- while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
- __ret) { \
- spin_unlock_irq(&gcwq->lock); \
- __wait_event_timeout(gcwq->trustee_wait, (cond) || \
- (gcwq->trustee_state == TRUSTEE_RELEASE), \
- __ret); \
- spin_lock_irq(&gcwq->lock); \
- } \
- gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use. Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({ \
- long __ret1; \
- __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
- __ret1 < 0 ? -1 : 0; \
-})
-
-static int __cpuinit trustee_thread(void *__gcwq)
+/* claim manager positions of all pools */
+static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
{
- struct global_cwq *gcwq = __gcwq;
- struct worker *worker;
- struct work_struct *work;
- struct hlist_node *pos;
- long rc;
- int i;
-
- BUG_ON(gcwq->cpu != smp_processor_id());
+ struct worker_pool *pool;
+ for_each_worker_pool(pool, gcwq)
+ mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
spin_lock_irq(&gcwq->lock);
- /*
- * Claim the manager position and make all workers rogue.
- * Trustee must be bound to the target cpu and can't be
- * cancelled.
- */
- BUG_ON(gcwq->cpu != smp_processor_id());
- rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
- BUG_ON(rc < 0);
-
- gcwq->flags |= GCWQ_MANAGING_WORKERS;
-
- list_for_each_entry(worker, &gcwq->idle_list, entry)
- worker->flags |= WORKER_ROGUE;
+}
- for_each_busy_worker(worker, i, pos, gcwq)
- worker->flags |= WORKER_ROGUE;
+/* release manager positions */
+static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+{
+ struct worker_pool *pool;
- /*
- * Call schedule() so that we cross rq->lock and thus can
- * guarantee sched callbacks see the rogue flag. This is
- * necessary as scheduler callbacks may be invoked from other
- * cpus.
- */
spin_unlock_irq(&gcwq->lock);
- schedule();
- spin_lock_irq(&gcwq->lock);
+ for_each_worker_pool(pool, gcwq)
+ mutex_unlock(&pool->manager_mutex);
+}
- /*
- * Sched callbacks are disabled now. Zap nr_running. After
- * this, nr_running stays zero and need_more_worker() and
- * keep_working() are always true as long as the worklist is
- * not empty.
- */
- atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+static void gcwq_unbind_fn(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_gcwq(smp_processor_id());
+ struct worker_pool *pool;
+ struct worker *worker;
+ struct hlist_node *pos;
+ int i;
- spin_unlock_irq(&gcwq->lock);
- del_timer_sync(&gcwq->idle_timer);
- spin_lock_irq(&gcwq->lock);
+ BUG_ON(gcwq->cpu != smp_processor_id());
- /*
- * We're now in charge. Notify and proceed to drain. We need
- * to keep the gcwq running during the whole CPU down
- * procedure as other cpu hotunplug callbacks may need to
- * flush currently running tasks.
- */
- gcwq->trustee_state = TRUSTEE_IN_CHARGE;
- wake_up_all(&gcwq->trustee_wait);
+ gcwq_claim_management_and_lock(gcwq);
/*
- * The original cpu is in the process of dying and may go away
- * anytime now. When that happens, we and all workers would
- * be migrated to other cpus. Try draining any left work. We
- * want to get it over with ASAP - spam rescuers, wake up as
- * many idlers as necessary and create new ones till the
- * worklist is empty. Note that if the gcwq is frozen, there
- * may be frozen works in freezable cwqs. Don't declare
- * completion while frozen.
+ * We've claimed all manager positions. Make all workers unbound
+ * and set DISASSOCIATED. Before this, all workers except for the
+ * ones which are still executing works from before the last CPU
+ * down must be on the cpu. After this, they may become diasporas.
*/
- while (gcwq->nr_workers != gcwq->nr_idle ||
- gcwq->flags & GCWQ_FREEZING ||
- gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
- int nr_works = 0;
-
- list_for_each_entry(work, &gcwq->worklist, entry) {
- send_mayday(work);
- nr_works++;
- }
+ for_each_worker_pool(pool, gcwq)
+ list_for_each_entry(worker, &pool->idle_list, entry)
+ worker->flags |= WORKER_UNBOUND;
- list_for_each_entry(worker, &gcwq->idle_list, entry) {
- if (!nr_works--)
- break;
- wake_up_process(worker->task);
- }
+ for_each_busy_worker(worker, i, pos, gcwq)
+ worker->flags |= WORKER_UNBOUND;
- if (need_to_create_worker(gcwq)) {
- spin_unlock_irq(&gcwq->lock);
- worker = create_worker(gcwq, false);
- spin_lock_irq(&gcwq->lock);
- if (worker) {
- worker->flags |= WORKER_ROGUE;
- start_worker(worker);
- }
- }
+ gcwq->flags |= GCWQ_DISASSOCIATED;
- /* give a breather */
- if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
- break;
- }
+ gcwq_release_management_and_unlock(gcwq);
/*
- * Either all works have been scheduled and cpu is down, or
- * cpu down has already been canceled. Wait for and butcher
- * all workers till we're canceled.
+ * Call schedule() so that we cross rq->lock and thus can guarantee
+ * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
+ * as scheduler callbacks may be invoked from other cpus.
*/
- do {
- rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
- while (!list_empty(&gcwq->idle_list))
- destroy_worker(list_first_entry(&gcwq->idle_list,
- struct worker, entry));
- } while (gcwq->nr_workers && rc >= 0);
+ schedule();
/*
- * At this point, either draining has completed and no worker
- * is left, or cpu down has been canceled or the cpu is being
- * brought back up. There shouldn't be any idle one left.
- * Tell the remaining busy ones to rebind once it finishes the
- * currently scheduled works by scheduling the rebind_work.
+ * Sched callbacks are disabled now. Zap nr_running. After this,
+ * nr_running stays zero and need_more_worker() and keep_working()
+ * are always true as long as the worklist is not empty. @gcwq now
+ * behaves as unbound (in terms of concurrency management) gcwq
+ * which is served by workers tied to the CPU.
+ *
+ * On return from this function, the current worker would trigger
+ * unbound chain execution of pending work items if other workers
+ * didn't already.
*/
- WARN_ON(!list_empty(&gcwq->idle_list));
-
- for_each_busy_worker(worker, i, pos, gcwq) {
- struct work_struct *rebind_work = &worker->rebind_work;
-
- /*
- * Rebind_work may race with future cpu hotplug
- * operations. Use a separate flag to mark that
- * rebinding is scheduled.
- */
- worker->flags |= WORKER_REBIND;
- worker->flags &= ~WORKER_ROGUE;
-
- /* queue rebind_work, wq doesn't matter, use the default one */
- if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(rebind_work)))
- continue;
-
- debug_work_activate(rebind_work);
- insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
- }
-
- /* relinquish manager role */
- gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-
- /* notify completion */
- gcwq->trustee = NULL;
- gcwq->trustee_state = TRUSTEE_DONE;
- wake_up_all(&gcwq->trustee_wait);
- spin_unlock_irq(&gcwq->lock);
- return 0;
+ for_each_worker_pool(pool, gcwq)
+ atomic_set(get_pool_nr_running(pool), 0);
}
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state. DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times. To be used by cpu_callback.
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered as high priority CPU notifier.
*/
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
- if (!(gcwq->trustee_state == state ||
- gcwq->trustee_state == TRUSTEE_DONE)) {
- spin_unlock_irq(&gcwq->lock);
- __wait_event(gcwq->trustee_wait,
- gcwq->trustee_state == state ||
- gcwq->trustee_state == TRUSTEE_DONE);
- spin_lock_irq(&gcwq->lock);
- }
-}
-
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct global_cwq *gcwq = get_gcwq(cpu);
- struct task_struct *new_trustee = NULL;
- struct worker *uninitialized_var(new_worker);
- unsigned long flags;
-
- action &= ~CPU_TASKS_FROZEN;
+ struct worker_pool *pool;
- switch (action) {
- case CPU_DOWN_PREPARE:
- new_trustee = kthread_create(trustee_thread, gcwq,
- "workqueue_trustee/%d\n", cpu);
- if (IS_ERR(new_trustee))
- return notifier_from_errno(PTR_ERR(new_trustee));
- kthread_bind(new_trustee, cpu);
- /* fall through */
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- BUG_ON(gcwq->first_idle);
- new_worker = create_worker(gcwq, false);
- if (!new_worker) {
- if (new_trustee)
- kthread_stop(new_trustee);
- return NOTIFY_BAD;
- }
- }
-
- /* some are called w/ irq disabled, don't disturb irq status */
- spin_lock_irqsave(&gcwq->lock, flags);
+ for_each_worker_pool(pool, gcwq) {
+ struct worker *worker;
- switch (action) {
- case CPU_DOWN_PREPARE:
- /* initialize trustee and tell it to acquire the gcwq */
- BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
- gcwq->trustee = new_trustee;
- gcwq->trustee_state = TRUSTEE_START;
- wake_up_process(gcwq->trustee);
- wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
- /* fall through */
- case CPU_UP_PREPARE:
- BUG_ON(gcwq->first_idle);
- gcwq->first_idle = new_worker;
- break;
+ if (pool->nr_workers)
+ continue;
- case CPU_DYING:
- /*
- * Before this, the trustee and all workers except for
- * the ones which are still executing works from
- * before the last CPU down must be on the cpu. After
- * this, they'll all be diasporas.
- */
- gcwq->flags |= GCWQ_DISASSOCIATED;
- break;
+ worker = create_worker(pool);
+ if (!worker)
+ return NOTIFY_BAD;
- case CPU_POST_DEAD:
- gcwq->trustee_state = TRUSTEE_BUTCHER;
- /* fall through */
- case CPU_UP_CANCELED:
- destroy_worker(gcwq->first_idle);
- gcwq->first_idle = NULL;
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ spin_unlock_irq(&gcwq->lock);
+ }
break;
case CPU_DOWN_FAILED:
case CPU_ONLINE:
+ gcwq_claim_management_and_lock(gcwq);
gcwq->flags &= ~GCWQ_DISASSOCIATED;
- if (gcwq->trustee_state != TRUSTEE_DONE) {
- gcwq->trustee_state = TRUSTEE_RELEASE;
- wake_up_process(gcwq->trustee);
- wait_trustee_state(gcwq, TRUSTEE_DONE);
- }
-
- /*
- * Trustee is done and there might be no worker left.
- * Put the first_idle in and request a real manager to
- * take a look.
- */
- spin_unlock_irq(&gcwq->lock);
- kthread_bind(gcwq->first_idle->task, cpu);
- spin_lock_irq(&gcwq->lock);
- gcwq->flags |= GCWQ_MANAGE_WORKERS;
- start_worker(gcwq->first_idle);
- gcwq->first_idle = NULL;
+ rebind_workers(gcwq);
+ gcwq_release_management_and_unlock(gcwq);
break;
}
+ return NOTIFY_OK;
+}
- spin_unlock_irqrestore(&gcwq->lock, flags);
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct work_struct unbind_work;
- return notifier_from_errno(0);
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /* unbinding should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+ schedule_work_on(cpu, &unbind_work);
+ flush_work(&unbind_work);
+ break;
+ }
+ return NOTIFY_OK;
}
#ifdef CONFIG_SMP
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void)
for_each_gcwq_cpu(cpu) {
struct global_cwq *gcwq = get_gcwq(cpu);
+ struct worker_pool *pool;
struct workqueue_struct *wq;
spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void)
cwq_activate_first_delayed(cwq);
}
- wake_up_worker(gcwq);
+ for_each_worker_pool(pool, gcwq)
+ wake_up_worker(pool);
spin_unlock_irq(&gcwq->lock);
}
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void)
unsigned int cpu;
int i;
- cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+ cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+ cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
/* initialize gcwqs */
for_each_gcwq_cpu(cpu) {
struct global_cwq *gcwq = get_gcwq(cpu);
+ struct worker_pool *pool;
spin_lock_init(&gcwq->lock);
- INIT_LIST_HEAD(&gcwq->worklist);
gcwq->cpu = cpu;
gcwq->flags |= GCWQ_DISASSOCIATED;
- INIT_LIST_HEAD(&gcwq->idle_list);
for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
- init_timer_deferrable(&gcwq->idle_timer);
- gcwq->idle_timer.function = idle_worker_timeout;
- gcwq->idle_timer.data = (unsigned long)gcwq;
+ for_each_worker_pool(pool, gcwq) {
+ pool->gcwq = gcwq;
+ INIT_LIST_HEAD(&pool->worklist);
+ INIT_LIST_HEAD(&pool->idle_list);
+
+ init_timer_deferrable(&pool->idle_timer);
+ pool->idle_timer.function = idle_worker_timeout;
+ pool->idle_timer.data = (unsigned long)pool;
- setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
- (unsigned long)gcwq);
+ setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+ (unsigned long)pool);
- ida_init(&gcwq->worker_ida);
+ mutex_init(&pool->manager_mutex);
+ ida_init(&pool->worker_ida);
+ }
- gcwq->trustee_state = TRUSTEE_DONE;
- init_waitqueue_head(&gcwq->trustee_wait);
+ init_waitqueue_head(&gcwq->rebind_hold);
}
/* create the initial worker */
for_each_online_gcwq_cpu(cpu) {
struct global_cwq *gcwq = get_gcwq(cpu);
- struct worker *worker;
+ struct worker_pool *pool;
if (cpu != WORK_CPU_UNBOUND)
gcwq->flags &= ~GCWQ_DISASSOCIATED;
- worker = create_worker(gcwq, true);
- BUG_ON(!worker);
- spin_lock_irq(&gcwq->lock);
- start_worker(worker);
- spin_unlock_irq(&gcwq->lock);
+
+ for_each_worker_pool(pool, gcwq) {
+ struct worker *worker;
+
+ worker = create_worker(pool);
+ BUG_ON(!worker);
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ spin_unlock_irq(&gcwq->lock);
+ }
}
system_wq = alloc_workqueue("events", 0, 0);