diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/async.c | 76 | ||||
-rw-r--r-- | kernel/audit.c | 30 | ||||
-rw-r--r-- | kernel/audit_tree.c | 10 | ||||
-rw-r--r-- | kernel/audit_watch.c | 25 | ||||
-rw-r--r-- | kernel/cgroup.c | 53 | ||||
-rw-r--r-- | kernel/cpuset.c | 130 | ||||
-rw-r--r-- | kernel/exit.c | 8 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/irq/irqdomain.c | 8 | ||||
-rw-r--r-- | kernel/irq/manage.c | 6 | ||||
-rw-r--r-- | kernel/kthread.c | 88 | ||||
-rw-r--r-- | kernel/printk.c | 159 | ||||
-rw-r--r-- | kernel/resource.c | 13 | ||||
-rw-r--r-- | kernel/sched/core.c | 92 | ||||
-rw-r--r-- | kernel/sched/fair.c | 113 | ||||
-rw-r--r-- | kernel/sched/sched.h | 23 | ||||
-rw-r--r-- | kernel/signal.c | 15 | ||||
-rw-r--r-- | kernel/task_work.c | 94 | ||||
-rw-r--r-- | kernel/trace/trace.c | 7 | ||||
-rw-r--r-- | kernel/trace/trace_functions.c | 36 | ||||
-rw-r--r-- | kernel/workqueue.c | 1144 |
21 files changed, 1174 insertions, 958 deletions
diff --git a/kernel/async.c b/kernel/async.c index bd0c168a3bb..9d311838485 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); +static LIST_HEAD(async_domains); static DEFINE_SPINLOCK(async_lock); +static DEFINE_MUTEX(async_register_mutex); struct async_entry { struct list_head list; @@ -71,7 +73,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,13 +84,12 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work) /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); + if (running->registered && --running->count == 0) + list_del_init(&running->node); /* 4) free the entry */ kfree(entry); @@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l spin_lock_irqsave(&async_lock, flags); newcookie = entry->cookie = next_cookie++; list_add_tail(&entry->list, &async_pending); + if (running->registered && running->count++ == 0) + list_add_tail(&running->node, &async_domains); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { + mutex_lock(&async_register_mutex); do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + struct async_domain *domain = NULL; + + spin_lock_irq(&async_lock); + if (!list_empty(&async_domains)) + domain = list_first_entry(&async_domains, typeof(*domain), node); + spin_unlock_irq(&async_lock); + + async_synchronize_cookie_domain(next_cookie, domain); + } while (!list_empty(&async_domains)); + mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_synchronize_full); /** + * async_unregister_domain - ensure no more anonymous waiters on this domain + * @domain: idle domain to flush out of any async_synchronize_full instances + * + * async_synchronize_{cookie|full}_domain() are not flushed since callers + * of these routines should know the lifetime of @domain + * + * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing + */ +void async_unregister_domain(struct async_domain *domain) +{ + mutex_lock(&async_register_mutex); + spin_lock_irq(&async_lock); + WARN_ON(!domain->registered || !list_empty(&domain->node) || + !list_empty(&domain->domain)); + domain->registered = 0; + spin_unlock_irq(&async_lock); + mutex_unlock(&async_register_mutex); +} +EXPORT_SYMBOL_GPL(async_unregister_domain); + +/** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; + if (!running) + return; + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); diff --git a/kernel/audit.c b/kernel/audit.c index 1c7f2c61416..4a3f28d2ca6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb) static void audit_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - char *data = NLMSG_DATA(nlh); + char *data = nlmsg_data(nlh); if (nlh->nlmsg_type != AUDIT_EOE) { if (printk_ratelimit()) @@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, if (!skb) return NULL; - nlh = NLMSG_NEW(skb, pid, seq, t, size, flags); - data = NLMSG_DATA(nlh); + nlh = nlmsg_put(skb, pid, seq, t, size, flags); + if (!nlh) + goto out_kfree_skb; + data = nlmsg_data(nlh); memcpy(data, payload, size); return skb; -nlmsg_failure: /* Used by NLMSG_NEW */ - if (skb) - kfree_skb(skb); +out_kfree_skb: + kfree_skb(skb); return NULL; } @@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; - data = NLMSG_DATA(nlh); + data = nlmsg_data(nlh); switch (msg_type) { case AUDIT_GET: @@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb) static int __init audit_init(void) { int i; + struct netlink_kernel_cfg cfg = { + .input = audit_receive, + }; if (audit_initialized == AUDIT_DISABLED) return 0; printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, - audit_receive, NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, + THIS_MODULE, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else @@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) - goto nlmsg_failure; + goto err; - nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0); + nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); + if (!nlh) + goto out_kfree_skb; return ab; -nlmsg_failure: /* Used by NLMSG_NEW */ +out_kfree_skb: kfree_skb(ab->skb); ab->skb = NULL; err: diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5bf0790497e..3a5ca582ba1 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -595,7 +595,7 @@ void audit_trim_trees(void) root_mnt = collect_mounts(&path); path_put(&path); - if (!root_mnt) + if (IS_ERR(root_mnt)) goto skip_it; spin_lock(&hash_lock); @@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule) goto Err; mnt = collect_mounts(&path); path_put(&path); - if (!mnt) { - err = -ENOMEM; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); goto Err; } @@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new) return err; tagged = collect_mounts(&path2); path_put(&path2); - if (!tagged) - return -ENOMEM; + if (IS_ERR(tagged)) + return PTR_ERR(tagged); err = kern_path(old, 0, &path1); if (err) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e683869365d..3823281401b 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata nd; - struct dentry *d; - int err; - - err = kern_path_parent(watch->path, &nd); - if (err) - return err; - - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - return -EINVAL; - } - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); - if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + struct dentry *d = kern_path_locked(watch->path, parent); + if (IS_ERR(d)) return PTR_ERR(d); - } + mutex_unlock(&parent->dentry->d_inode->i_mutex); if (d->d_inode) { /* update watch filter fields */ watch->dev = d->d_inode->i_sb->s_dev; watch->ino = d->d_inode->i_ino; } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - - *parent = nd.path; dput(d); return 0; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b303dfc7dce..79818507e44 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) dget(d); d_delete(d); - simple_unlink(d->d_inode, d); + simple_unlink(cgrp->dentry->d_inode, d); list_del_init(&cfe->node); dput(d); @@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgrp); - mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); - mutex_unlock(&ss->hierarchy_mutex); /* subsystem is now free - drop reference on module */ module_put(ss->module); } else if (bit & final_bits) { @@ -1587,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); @@ -2570,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } -static void cgroup_lock_hierarchy(struct cgroupfs_root *root) -{ - /* We need to take each hierarchy_mutex in a consistent order */ - int i; - - /* - * No worry about a race with rebind_subsystems that might mess up the - * locking order, since both parties are under cgroup_mutex. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_lock(&ss->hierarchy_mutex); - } -} - -static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) -{ - int i; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->root == root) - mutex_unlock(&ss->hierarchy_mutex); - } -} - /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_clone(cgrp); } - cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); - cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - cgroup_lock_hierarchy(root); list_del(&cgrp->sibling); - cgroup_unlock_hierarchy(root); root->number_of_cgroups--; err_destroy: @@ -4245,10 +4206,8 @@ again: list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ list_del_init(&cgrp->sibling); - cgroup_unlock_hierarchy(cgrp->root); list_del_init(&cgrp->allcg_node); @@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* this function shouldn't be used with modular subsystems, since they @@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - mutex_init(&ss->hierarchy_mutex); - lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; /* success! */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd1..f33c7153b6d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + +/* + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) @@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. */ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/exit.c b/kernel/exit.c index 2f59cc33451..f65345f9e5b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -483,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; @@ -953,14 +953,11 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes, and in - * task_work_add() to avoid the race with exit_task_work(). + * an exiting task cleaning up the robust pi futexes. */ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - exit_task_work(tsk); - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), @@ -995,6 +992,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_work(tsk); check_stack_usage(); exit_thread(); diff --git a/kernel/fork.c b/kernel/fork.c index f00e319d837..ff1cad3b7bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1420,7 +1420,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_HLIST_HEAD(&p->task_works); + p->task_works = NULL; /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 41c1564103f..38c5eb839c9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -448,7 +448,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); + hwirq, of_node_full_name(domain->of_node), virq); return virq; } @@ -477,7 +477,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return intspec[0]; #endif pr_warning("no irq domain found for %s !\n", - controller->full_name); + of_node_full_name(controller)); return 0; } @@ -725,8 +725,8 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain && desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; + if (desc->irq_data.domain) + p = of_node_full_name(desc->irq_data.domain->of_node); else p = none; seq_printf(m, "%s\n", p); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba3..814c9ef6bba 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc) wake_up(&desc->wait_for_threads); } -static void irq_thread_dtor(struct task_work *unused) +static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; @@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused) */ static int irq_thread(void *data) { - struct task_work on_exit_work; + struct callback_head on_exit_work; static const struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; @@ -830,7 +830,7 @@ static int irq_thread(void *data) sched_setscheduler(current, SCHED_FIFO, ¶m); - init_task_work(&on_exit_work, irq_thread_dtor, NULL); + init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); while (!irq_wait_for_interrupt(action)) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702..b579af57ea1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -360,16 +360,12 @@ repeat: struct kthread_work, node); list_del_init(&work->node); } + worker->current_work = work; spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); work->func(work); - smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ - work->done_seq = work->queue_seq; - smp_mb(); /* mb worker-b1 paired with flush-b0 */ - if (atomic_read(&work->flushing)) - wake_up_all(&work->done); } else if (!freezing(current)) schedule(); @@ -378,6 +374,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* insert @work before @pos in @worker */ +static void insert_kthread_work(struct kthread_worker *worker, + struct kthread_work *work, + struct list_head *pos) +{ + lockdep_assert_held(&worker->lock); + + list_add_tail(&work->node, pos); + work->worker = worker; + if (likely(worker->task)) + wake_up_process(worker->task); +} + /** * queue_kthread_work - queue a kthread_work * @worker: target kthread_worker @@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - list_add_tail(&work->node, &worker->work_list); - work->queue_seq++; - if (likely(worker->task)) - wake_up_process(worker->task); + insert_kthread_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); @@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(queue_kthread_work); +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + /** * flush_kthread_work - flush a kthread_work * @work: work to flush @@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work); */ void flush_kthread_work(struct kthread_work *work) { - int seq = work->queue_seq; - - atomic_inc(&work->flushing); + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + struct kthread_worker *worker; + bool noop = false; - /* - * mb flush-b0 paired with worker-b1, to make sure either - * worker sees the above increment or we see done_seq update. - */ - smp_mb__after_atomic_inc(); +retry: + worker = work->worker; + if (!worker) + return; - /* A - B <= 0 tests whether B is in front of A regardless of overflow */ - wait_event(work->done, seq - work->done_seq <= 0); - atomic_dec(&work->flushing); + spin_lock_irq(&worker->lock); + if (work->worker != worker) { + spin_unlock_irq(&worker->lock); + goto retry; + } - /* - * rmb flush-b1 paired with worker-b0, to make sure our caller - * sees every change made by work->func(). - */ - smp_mb__after_atomic_dec(); -} -EXPORT_SYMBOL_GPL(flush_kthread_work); + if (!list_empty(&work->node)) + insert_kthread_work(worker, &fwork.work, work->node.next); + else if (worker->current_work == work) + insert_kthread_work(worker, &fwork.work, worker->work_list.next); + else + noop = true; -struct kthread_flush_work { - struct kthread_work work; - struct completion done; -}; + spin_unlock_irq(&worker->lock); -static void kthread_flush_work_fn(struct kthread_work *work) -{ - struct kthread_flush_work *fwork = - container_of(work, struct kthread_flush_work, work); - complete(&fwork->done); + if (!noop) + wait_for_completion(&fwork.done); } +EXPORT_SYMBOL_GPL(flush_kthread_work); /** * flush_kthread_worker - flush all current works on a kthread_worker diff --git a/kernel/printk.c b/kernel/printk.c index ac4bc9e7946..50c96b5651b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -216,6 +216,7 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -228,14 +229,19 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -360,6 +366,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -425,6 +432,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; @@ -462,8 +470,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - (msg->facility << 3) | msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -646,6 +671,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. + */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif @@ -876,7 +910,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; if (prefix) @@ -907,7 +941,7 @@ static int syslog_print(char __user *buf, int size) struct log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -930,7 +964,8 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -969,7 +1004,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1022,7 +1057,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; @@ -1349,20 +1385,36 @@ static struct cont { u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(void) +static void cont_flush(enum log_flags flags) { if (cont.flushed) return; if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); - - cont.flushed = true; + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. + */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } } static bool cont_add(int facility, int level, const char *text, size_t len) @@ -1371,7 +1423,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len) return false; if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); return false; } @@ -1380,12 +1433,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); + cont.flags = 0; cont.cons = 0; cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + return true; } @@ -1394,7 +1452,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0) { + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1409,7 +1467,8 @@ static size_t cont_print_text(char *text, size_t size) } if (cont.flushed) { - text[textlen++] = '\n'; + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; /* got everything, release buffer */ cont.len = 0; } @@ -1507,7 +1566,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(); + cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1525,7 +1584,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(); + cont_flush(LOG_NEWLINE); } if (!stored) @@ -1616,9 +1675,20 @@ asmlinkage int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 #define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; +static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1902,10 +1972,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1923,7 +2017,7 @@ static enum log_flags console_prev; */ void console_unlock(void) { - static char text[LOG_LINE_MAX]; + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1937,19 +2031,7 @@ void console_unlock(void) console_may_schedule = 0; /* flush buffered message fragment immediately to console */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (cont.len && (cont.cons < cont.len || cont.flushed)) { - size_t len; - - len = cont_print_text(text, sizeof(text)); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; @@ -1986,6 +2068,7 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; goto skip; } diff --git a/kernel/resource.c b/kernel/resource.c index e1d2b8ee76d..dc8b4776444 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -722,14 +722,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_lock(&resource_lock); + if (!parent) + goto skip; + if ((start < parent->start) || (end > parent->end)) goto out; - for (tmp = res->child; tmp; tmp = tmp->sibling) { - if ((tmp->start < start) || (tmp->end > end)) - goto out; - } - if (res->sibling && (res->sibling->start <= end)) goto out; @@ -741,6 +739,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t goto out; } +skip: + for (tmp = res->child; tmp; tmp = tmp->sibling) + if ((tmp->start < start) || (tmp->end > end)) + goto out; + res->start = start; res->end = end; result = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1b..5d011ef4c0d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; @@ -7097,34 +7134,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(true); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe..22321db6495 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } @@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; unsigned int flags; @@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -4267,6 +4281,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4284,7 +4299,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4296,14 +4317,52 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435..c35a1a7dd4d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ diff --git a/kernel/signal.c b/kernel/signal.c index 677102789cf..be4f856d52f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why) void ptrace_notify(int exit_code) { BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } spin_lock_irq(¤t->sighand->siglock); ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); @@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) { + if (test_and_clear_ti_thread_flag(current_thread_info(), + TIF_NOTIFY_RESUME)) { + smp_mb__after_clear_bit(); + task_work_run(); + } + } + if (unlikely(uprobe_deny_signal())) return 0; diff --git a/kernel/task_work.c b/kernel/task_work.c index 82d1c794066..91d4e1742a0 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -3,82 +3,78 @@ #include <linux/tracehook.h> int -task_work_add(struct task_struct *task, struct task_work *twork, bool notify) +task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) { + struct callback_head *last, *first; unsigned long flags; - int err = -ESRCH; -#ifndef TIF_NOTIFY_RESUME - if (notify) - return -ENOTSUPP; -#endif /* - * We must not insert the new work if the task has already passed - * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() - * and check PF_EXITING under pi_lock. + * Not inserting the new work if the task has already passed + * exit_task_work() is the responisbility of callers. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - if (likely(!(task->flags & PF_EXITING))) { - hlist_add_head(&twork->hlist, &task->task_works); - err = 0; - } + last = task->task_works; + first = last ? last->next : twork; + twork->next = first; + if (last) + last->next = twork; + task->task_works = twork; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ - if (likely(!err) && notify) + if (notify) set_notify_resume(task); - return err; + return 0; } -struct task_work * +struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { unsigned long flags; - struct task_work *twork; - struct hlist_node *pos; + struct callback_head *last, *res = NULL; raw_spin_lock_irqsave(&task->pi_lock, flags); - hlist_for_each_entry(twork, pos, &task->task_works, hlist) { - if (twork->func == func) { - hlist_del(&twork->hlist); - goto found; + last = task->task_works; + if (last) { + struct callback_head *q = last, *p = q->next; + while (1) { + if (p->func == func) { + q->next = p->next; + if (p == last) + task->task_works = q == p ? NULL : q; + res = p; + break; + } + if (p == last) + break; + q = p; + p = q->next; } } - twork = NULL; - found: raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - return twork; + return res; } void task_work_run(void) { struct task_struct *task = current; - struct hlist_head task_works; - struct hlist_node *pos; + struct callback_head *p, *q; - raw_spin_lock_irq(&task->pi_lock); - hlist_move_list(&task->task_works, &task_works); - raw_spin_unlock_irq(&task->pi_lock); + while (1) { + raw_spin_lock_irq(&task->pi_lock); + p = task->task_works; + task->task_works = NULL; + raw_spin_unlock_irq(&task->pi_lock); - if (unlikely(hlist_empty(&task_works))) - return; - /* - * We use hlist to save the space in task_struct, but we want fifo. - * Find the last entry, the list should be short, then process them - * in reverse order. - */ - for (pos = task_works.first; pos->next; pos = pos->next) - ; + if (unlikely(!p)) + return; - for (;;) { - struct hlist_node **pprev = pos->pprev; - struct task_work *twork = container_of(pos, struct task_work, - hlist); - twork->func(twork); - - if (pprev == &task_works.first) - break; - pos = container_of(pprev, struct hlist_node, next); + q = p->next; /* head */ + p->next = NULL; /* cut it */ + while (q) { + p = q->next; + q->func(q); + q = p; + } } } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a120f98c411..5c38c81496c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db0..a426f410c06 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, +#endif { } /* Always set a last empty entry */ }; @@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a3128dc67d..692d97628a1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,32 +45,41 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ - GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ - GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - GCWQ_FREEZING = 1 << 3, /* freeze in progress */ - GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. + * + * Note that DISASSOCIATED can be flipped only while holding + * managership of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ + GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 1, /* freeze in progress */ + + /* pool flags */ + POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ + NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, @@ -84,13 +93,13 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by * all cpus. Give -20. */ RESCUER_NICE_LEVEL = -20, + HIGHPRI_NICE_LEVEL = -20, }; /* @@ -115,6 +124,8 @@ enum { */ struct global_cwq; +struct worker_pool; +struct idle_rebind; /* * The poor guys doing the actual heavy lifting. All on-duty workers @@ -131,12 +142,31 @@ struct worker { struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct idle_rebind *idle_rebind; /* L: for idle worker */ + struct work_struct rebind_work; /* L: for busy worker */ +}; + +struct worker_pool { + struct global_cwq *gcwq; /* I: the owning gcwq */ + unsigned int flags; /* X: flags */ + + struct list_head worklist; /* L: list of pending works */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + struct list_head idle_list; /* X: list of idle workers */ + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for workers */ + + struct mutex manager_mutex; /* mutex manager should hold */ + struct ida worker_ida; /* L: for worker IDs */ }; /* @@ -146,27 +176,16 @@ struct worker { */ struct global_cwq { spinlock_t lock; /* the gcwq lock */ - struct list_head worklist; /* L: list of pending works */ unsigned int cpu; /* I: the associated cpu */ unsigned int flags; /* L: GCWQ_* flags */ - int nr_workers; /* L: total number of workers */ - int nr_idle; /* L: currently idle ones */ - - /* workers are chained either in the idle_list or busy_hash */ - struct list_head idle_list; /* X: list of idle workers */ + /* workers are chained either in busy_hash or pool idle_list */ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for dworkers */ - - struct ida worker_ida; /* L: for worker IDs */ + struct worker_pool pools[2]; /* normal and highpri pools */ - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ - struct worker *first_idle; /* L: first idle worker */ + wait_queue_head_t rebind_hold; /* rebind hold wait */ } ____cacheline_aligned_in_smp; /* @@ -175,7 +194,7 @@ struct global_cwq { * aligned at two's power of the number of flag bits. */ struct cpu_workqueue_struct { - struct global_cwq *gcwq; /* I: the associated gcwq */ + struct worker_pool *pool; /* I: the associated pool */ struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ @@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#define for_each_worker_pool(pool, gcwq) \ + for ((pool) = &(gcwq)->pools[0]; \ + (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) @@ -444,7 +467,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * try_to_wake_up(). Put it in a separate cacheline. */ static DEFINE_PER_CPU(struct global_cwq, global_cwq); -static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); /* * Global cpu workqueue and nr_running counter for unbound gcwq. The @@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); * workers have WORKER_UNBOUND set. */ static struct global_cwq unbound_global_cwq; -static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ +static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = { + [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */ +}; static int worker_thread(void *__worker); +static int worker_pool_pri(struct worker_pool *pool) +{ + return pool - pool->gcwq->pools; +} + static struct global_cwq *get_gcwq(unsigned int cpu) { if (cpu != WORK_CPU_UNBOUND) @@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu) return &unbound_global_cwq; } -static atomic_t *get_gcwq_nr_running(unsigned int cpu) +static atomic_t *get_pool_nr_running(struct worker_pool *pool) { + int cpu = pool->gcwq->cpu; + int idx = worker_pool_pri(pool); + if (cpu != WORK_CPU_UNBOUND) - return &per_cpu(gcwq_nr_running, cpu); + return &per_cpu(pool_nr_running, cpu)[idx]; else - return &unbound_gcwq_nr_running; + return &unbound_pool_nr_running[idx]; } static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, @@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) if (data & WORK_STRUCT_CWQ) return ((struct cpu_workqueue_struct *) - (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; cpu = data >> WORK_STRUCT_FLAG_BITS; if (cpu == WORK_CPU_NONE) @@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) } /* - * Policy functions. These define the policies on how the global - * worker pool is managed. Unless noted otherwise, these functions - * assume that they're being called with gcwq->lock held. + * Policy functions. These define the policies on how the global worker + * pools are managed. Unless noted otherwise, these functions assume that + * they're being called with gcwq->lock held. */ -static bool __need_more_worker(struct global_cwq *gcwq) +static bool __need_more_worker(struct worker_pool *pool) { - return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || - gcwq->flags & GCWQ_HIGHPRI_PENDING; + return !atomic_read(get_pool_nr_running(pool)); } /* * Need to wake up a worker? Called from anything but currently * running workers. + * + * Note that, because unbound workers never contribute to nr_running, this + * function will always return %true for unbound gcwq as long as the + * worklist isn't empty. */ -static bool need_more_worker(struct global_cwq *gcwq) +static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); + return !list_empty(&pool->worklist) && __need_more_worker(pool); } /* Can I start working? Called from busy but !running workers. */ -static bool may_start_working(struct global_cwq *gcwq) +static bool may_start_working(struct worker_pool *pool) { - return gcwq->nr_idle; + return pool->nr_idle; } /* Do I need to keep working? Called from currently running workers. */ -static bool keep_working(struct global_cwq *gcwq) +static bool keep_working(struct worker_pool *pool) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); - return !list_empty(&gcwq->worklist) && - (atomic_read(nr_running) <= 1 || - gcwq->flags & GCWQ_HIGHPRI_PENDING); + return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; } /* Do we need a new worker? Called from manager. */ -static bool need_to_create_worker(struct global_cwq *gcwq) +static bool need_to_create_worker(struct worker_pool *pool) { - return need_more_worker(gcwq) && !may_start_working(gcwq); + return need_more_worker(pool) && !may_start_working(pool); } /* Do I need to be the manager? */ -static bool need_to_manage_workers(struct global_cwq *gcwq) +static bool need_to_manage_workers(struct worker_pool *pool) { - return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; + return need_to_create_worker(pool) || + (pool->flags & POOL_MANAGE_WORKERS); } /* Do we have too many workers and should some go away? */ -static bool too_many_workers(struct global_cwq *gcwq) +static bool too_many_workers(struct worker_pool *pool) { - bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; - int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ - int nr_busy = gcwq->nr_workers - nr_idle; + bool managing = mutex_is_locked(&pool->manager_mutex); + int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ + int nr_busy = pool->nr_workers - nr_idle; return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq) */ /* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct global_cwq *gcwq) +static struct worker *first_worker(struct worker_pool *pool) { - if (unlikely(list_empty(&gcwq->idle_list))) + if (unlikely(list_empty(&pool->idle_list))) return NULL; - return list_first_entry(&gcwq->idle_list, struct worker, entry); + return list_first_entry(&pool->idle_list, struct worker, entry); } /** * wake_up_worker - wake up an idle worker - * @gcwq: gcwq to wake worker for + * @pool: worker pool to wake worker from * - * Wake up the first idle worker of @gcwq. + * Wake up the first idle worker of @pool. * * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void wake_up_worker(struct global_cwq *gcwq) +static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(gcwq); + struct worker *worker = first_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) struct worker *worker = kthread_data(task); if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(cpu)); + atomic_inc(get_pool_nr_running(worker->pool)); } /** @@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; - struct global_cwq *gcwq = get_gcwq(cpu); - atomic_t *nr_running = get_gcwq_nr_running(cpu); + struct worker_pool *pool = worker->pool; + atomic_t *nr_running = get_pool_nr_running(pool); if (worker->flags & WORKER_NOT_RUNNING) return NULL; @@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ - if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) - to_wakeup = first_worker(gcwq); + if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) + to_wakeup = first_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, static inline void worker_set_flags(struct worker *worker, unsigned int flags, bool wakeup) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; WARN_ON_ONCE(worker->task != current); @@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ if ((flags & WORKER_NOT_RUNNING) && !(worker->flags & WORKER_NOT_RUNNING)) { - atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + atomic_t *nr_running = get_pool_nr_running(pool); if (wakeup) { if (atomic_dec_and_test(nr_running) && - !list_empty(&gcwq->worklist)) - wake_up_worker(gcwq); + !list_empty(&pool->worklist)) + wake_up_worker(pool); } else atomic_dec(nr_running); } @@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags, */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; WARN_ON_ONCE(worker->task != current); @@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) - atomic_inc(get_gcwq_nr_running(gcwq->cpu)); + atomic_inc(get_pool_nr_running(pool)); } /** @@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, } /** - * gcwq_determine_ins_pos - find insertion position - * @gcwq: gcwq of interest - * @cwq: cwq a work is being queued for - * - * A work for @cwq is about to be queued on @gcwq, determine insertion - * position for the work. If @cwq is for HIGHPRI wq, the work is - * queued at the head of the queue but in FIFO order with respect to - * other HIGHPRI works; otherwise, at the end of the queue. This - * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that - * there are HIGHPRI works pending. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - * - * RETURNS: - * Pointer to inserstion position. - */ -static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, - struct cpu_workqueue_struct *cwq) -{ - struct work_struct *twork; - - if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) - return &gcwq->worklist; - - list_for_each_entry(twork, &gcwq->worklist, entry) { - struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); - - if (!(tcwq->wq->flags & WQ_HIGHPRI)) - break; - } - - gcwq->flags |= GCWQ_HIGHPRI_PENDING; - return &twork->entry; -} - -/** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to * @work: work to insert @@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; /* we own @work, set data and link */ set_work_cwq(work, cwq, extra_flags); @@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq, */ smp_mb(); - if (__need_more_worker(gcwq)) - wake_up_worker(gcwq); + if (__need_more_worker(pool)) + wake_up_worker(pool); } /* @@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (likely(cwq->nr_active < cwq->max_active)) { trace_workqueue_activate_work(work); cwq->nr_active++; - worklist = gcwq_determine_ins_pos(gcwq, cwq); + worklist = &cwq->pool->worklist; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; @@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); */ static void worker_enter_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; BUG_ON(worker->flags & WORKER_IDLE); BUG_ON(!list_empty(&worker->entry) && @@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker) /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; - gcwq->nr_idle++; + pool->nr_idle++; worker->last_active = jiffies; /* idle_list is LIFO */ - list_add(&worker->entry, &gcwq->idle_list); + list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) - mod_timer(&gcwq->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. */ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && - gcwq->nr_workers == gcwq->nr_idle && - atomic_read(get_gcwq_nr_running(gcwq->cpu))); + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && + pool->nr_workers == pool->nr_idle && + atomic_read(get_pool_nr_running(pool))); } /** @@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker) */ static void worker_leave_idle(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; BUG_ON(!(worker->flags & WORKER_IDLE)); worker_clr_flags(worker, WORKER_IDLE); - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); } @@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker) static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&gcwq->lock) { - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; struct task_struct *task = worker->task; while (true) { @@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock) } } +struct idle_rebind { + int cnt; /* # workers to be rebound */ + struct completion done; /* all workers rebound */ +}; + +/* + * Rebind an idle @worker to its CPU. During CPU onlining, this has to + * happen synchronously for idle workers. worker_thread() will test + * %WORKER_REBIND before leaving idle and call this function. + */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU must be online at this point */ + WARN_ON(!worker_maybe_bind_and_lock(worker)); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); + + /* we did our part, wait for rebind_workers() to finish up */ + wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); +} + /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Function for @worker->rebind.work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. */ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - struct global_cwq *gcwq = worker->gcwq; + struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) worker_clr_flags(worker, WORKER_REBIND); @@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * The idle ones should be rebound synchronously and idle rebinding should + * be complete before any worker starts executing work items with + * concurrency management enabled; otherwise, scheduler may oops trying to + * wake up non-local idle worker from wq_worker_sleeping(). + * + * This is achieved by repeatedly requesting rebinding until all idle + * workers are known to have been rebound under @gcwq->lock and holding all + * idle workers from becoming busy until idle rebinding is complete. + * + * Once idle workers are rebound, busy workers can be rebound as they + * finish executing their current work items. Queueing the rebind work at + * the head of their scheduled lists is enough. Note that nr_running will + * be properbly bumped as busy workers rebind. + * + * On return, all workers are guaranteed to either be bound or have rebind + * work item scheduled. + */ +static void rebind_workers(struct global_cwq *gcwq) + __releases(&gcwq->lock) __acquires(&gcwq->lock) +{ + struct idle_rebind idle_rebind; + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->manager_mutex); + + /* + * Rebind idle workers. Interlocked both ways. We wait for + * workers to rebind via @idle_rebind.done. Workers will wait for + * us to finish up by watching %WORKER_REBIND. + */ + init_completion(&idle_rebind.done); +retry: + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + /* set REBIND and kick idle ones, we'll wait for these later */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { + if (worker->flags & WORKER_REBIND) + continue; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + idle_rebind.cnt++; + worker->idle_rebind = &idle_rebind; + + /* worker_thread() will call idle_worker_rebind() */ + wake_up_process(worker->task); + } + } + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + /* busy ones might have become idle while waiting, retry */ + goto retry; + } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* morph UNBOUND to REBIND */ + worker->flags &= ~WORKER_UNBOUND; + worker->flags |= WORKER_REBIND; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + /* wq doesn't matter, use the default one */ + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker - * @gcwq: gcwq the new worker will belong to - * @bind: whether to set affinity to @cpu or not + * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @gcwq. The returned worker + * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using * destroy_worker(). * @@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct global_cwq *gcwq = pool->gcwq; + const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; spin_lock_irq(&gcwq->lock); - while (ida_get_new(&gcwq->worker_ida, &id)) { + while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&gcwq->lock); - if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) goto fail; spin_lock_irq(&gcwq->lock); } @@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) if (!worker) goto fail; - worker->gcwq = gcwq; + worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, - worker, - cpu_to_node(gcwq->cpu), - "kworker/%u:%d", gcwq->cpu, id); + worker, cpu_to_node(gcwq->cpu), + "kworker/%u:%d%s", gcwq->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d", id); + "kworker/u:%d%s", id, pri); if (IS_ERR(worker->task)) goto fail; + if (worker_pool_pri(pool)) + set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. + * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; fail: if (id >= 0) { spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); spin_unlock_irq(&gcwq->lock); } kfree(worker); @@ -1424,7 +1555,7 @@ fail: static void start_worker(struct worker *worker) { worker->flags |= WORKER_STARTED; - worker->gcwq->nr_workers++; + worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); } @@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker) */ static void destroy_worker(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; int id = worker->id; /* sanity check frenzy */ @@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker) BUG_ON(!list_empty(&worker->scheduled)); if (worker->flags & WORKER_STARTED) - gcwq->nr_workers--; + pool->nr_workers--; if (worker->flags & WORKER_IDLE) - gcwq->nr_idle--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker) kfree(worker); spin_lock_irq(&gcwq->lock); - ida_remove(&gcwq->worker_ida, id); + ida_remove(&pool->worker_ida, id); } -static void idle_worker_timeout(unsigned long __gcwq) +static void idle_worker_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; spin_lock_irq(&gcwq->lock); - if (too_many_workers(gcwq)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); else { /* it's been idle for too long, wake up manager */ - gcwq->flags |= GCWQ_MANAGE_WORKERS; - wake_up_worker(gcwq); + pool->flags |= POOL_MANAGE_WORKERS; + wake_up_worker(pool); } } @@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work) return false; /* mayday mayday mayday */ - cpu = cwq->gcwq->cpu; + cpu = cwq->pool->gcwq->cpu; /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ if (cpu == WORK_CPU_UNBOUND) cpu = 0; @@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work) return true; } -static void gcwq_mayday_timeout(unsigned long __gcwq) +static void gcwq_mayday_timeout(unsigned long __pool) { - struct global_cwq *gcwq = (void *)__gcwq; + struct worker_pool *pool = (void *)__pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work; spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) { + if (need_to_create_worker(pool)) { /* * We've been trying to create a new worker but * haven't been successful. We might be hitting an * allocation deadlock. Send distress signals to * rescuers. */ - list_for_each_entry(work, &gcwq->worklist, entry) + list_for_each_entry(work, &pool->worklist, entry) send_mayday(work); } spin_unlock_irq(&gcwq->lock); - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } /** * maybe_create_worker - create a new worker if necessary - * @gcwq: gcwq to create a new worker for + * @pool: pool to create a new worker for * - * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * Create a new worker for @pool if necessary. @pool is guaranteed to * have at least one idle worker on return from this function. If * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is - * sent to all rescuers with works scheduled on @gcwq to resolve + * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * * On return, need_to_create_worker() is guaranteed to be false and @@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_create_worker(struct global_cwq *gcwq) +static bool maybe_create_worker(struct worker_pool *pool) __releases(&gcwq->lock) __acquires(&gcwq->lock) { - if (!need_to_create_worker(gcwq)) + struct global_cwq *gcwq = pool->gcwq; + + if (!need_to_create_worker(pool)) return false; restart: spin_unlock_irq(&gcwq->lock); /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ - mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); while (true) { struct worker *worker; - worker = create_worker(gcwq, true); + worker = create_worker(pool); if (worker) { - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); start_worker(worker); - BUG_ON(need_to_create_worker(gcwq)); + BUG_ON(need_to_create_worker(pool)); return true; } - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CREATE_COOLDOWN); - if (!need_to_create_worker(gcwq)) + if (!need_to_create_worker(pool)) break; } - del_timer_sync(&gcwq->mayday_timer); + del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); - if (need_to_create_worker(gcwq)) + if (need_to_create_worker(pool)) goto restart; return true; } /** * maybe_destroy_worker - destroy workers which have been idle for a while - * @gcwq: gcwq to destroy workers for + * @pool: pool to destroy workers for * - * Destroy @gcwq workers which have been idle for longer than + * Destroy @pool workers which have been idle for longer than * IDLE_WORKER_TIMEOUT. * * LOCKING: @@ -1610,19 +1746,19 @@ restart: * false if no action was taken and gcwq->lock stayed locked, true * otherwise. */ -static bool maybe_destroy_workers(struct global_cwq *gcwq) +static bool maybe_destroy_workers(struct worker_pool *pool) { bool ret = false; - while (too_many_workers(gcwq)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; - worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { - mod_timer(&gcwq->idle_timer, expires); + mod_timer(&pool->idle_timer, expires); break; } @@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq) */ static bool manage_workers(struct worker *worker) { - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; bool ret = false; - if (gcwq->flags & GCWQ_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_mutex)) return ret; - gcwq->flags &= ~GCWQ_MANAGE_WORKERS; - gcwq->flags |= GCWQ_MANAGING_WORKERS; + pool->flags &= ~POOL_MANAGE_WORKERS; /* * Destroy and then create so that may_start_working() is true * on return. */ - ret |= maybe_destroy_workers(gcwq); - ret |= maybe_create_worker(gcwq); - - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); + ret |= maybe_destroy_workers(pool); + ret |= maybe_create_worker(pool); + mutex_unlock(&pool->manager_mutex); return ret; } @@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) { struct work_struct *work = list_first_entry(&cwq->delayed_works, struct work_struct, entry); - struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); trace_workqueue_activate_work(work); - move_linked_works(work, pos, NULL); + move_linked_works(work, &cwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1804,7 +1930,8 @@ __releases(&gcwq->lock) __acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; struct hlist_head *bwh = busy_worker_head(gcwq, work); bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; work_func_t f = work->func; @@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock) lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ + WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && + raw_smp_processor_id() != gcwq->cpu); + + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is * already processing the work. If so, defer the work to the @@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock) list_del_init(&work->entry); /* - * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, - * wake up another worker; otherwise, clear HIGHPRI_PENDING. - */ - if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { - struct work_struct *nwork = list_first_entry(&gcwq->worklist, - struct work_struct, entry); - - if (!list_empty(&gcwq->worklist) && - get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) - wake_up_worker(gcwq); - else - gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; - } - - /* * CPU intensive works don't participate in concurrency * management. They're the scheduler's responsibility. */ if (unlikely(cpu_intensive)) worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + /* + * Unbound gcwq isn't concurrency managed and work items should be + * executed ASAP. Wake up another worker if necessary. + */ + if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) + wake_up_worker(pool); + spin_unlock_irq(&gcwq->lock); work_clear_pending(work); @@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker) static int worker_thread(void *__worker) { struct worker *worker = __worker; - struct global_cwq *gcwq = worker->gcwq; + struct worker_pool *pool = worker->pool; + struct global_cwq *gcwq = pool->gcwq; /* tell the scheduler that this is a workqueue worker */ worker->task->flags |= PF_WQ_WORKER; woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* + * DIE can be set only while idle and REBIND set while busy has + * @worker->rebind_work scheduled. Checking here is enough. + */ + if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); recheck: /* no more worker necessary? */ - if (!need_more_worker(gcwq)) + if (!need_more_worker(pool)) goto sleep; /* do we need to manage? */ - if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + if (unlikely(!may_start_working(pool)) && manage_workers(worker)) goto recheck; /* @@ -1979,7 +2117,7 @@ recheck: do { struct work_struct *work = - list_first_entry(&gcwq->worklist, + list_first_entry(&pool->worklist, struct work_struct, entry); if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { @@ -1991,11 +2129,11 @@ recheck: move_linked_works(work, &worker->scheduled, NULL); process_scheduled_works(worker); } - } while (keep_working(gcwq)); + } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) goto recheck; /* @@ -2053,14 +2191,15 @@ repeat: for_each_mayday_cpu(cpu, wq->mayday_mask) { unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct worker_pool *pool = cwq->pool; + struct global_cwq *gcwq = pool->gcwq; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->gcwq = gcwq; + rescuer->pool = pool; worker_maybe_bind_and_lock(rescuer); /* @@ -2068,7 +2207,7 @@ repeat: * process'em. */ BUG_ON(!list_empty(&rescuer->scheduled)); - list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_cwq(work) == cwq) move_linked_works(work, scheduled, &n); @@ -2079,8 +2218,8 @@ repeat: * regular worker; otherwise, we end up with 0 concurrency * and stalling the execution. */ - if (keep_working(gcwq)) - wake_up_worker(gcwq); + if (keep_working(pool)) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - struct global_cwq *gcwq = cwq->gcwq; + struct global_cwq *gcwq = cwq->pool->gcwq; spin_lock_irq(&gcwq->lock); @@ -2421,9 +2560,9 @@ reflush: struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); bool drained; - spin_lock_irq(&cwq->gcwq->lock); + spin_lock_irq(&cwq->pool->gcwq->lock); drained = !cwq->nr_active && list_empty(&cwq->delayed_works); - spin_unlock_irq(&cwq->gcwq->lock); + spin_unlock_irq(&cwq->pool->gcwq->lock); if (drained) continue; @@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, */ smp_rmb(); cwq = get_work_cwq(work); - if (unlikely(!cwq || gcwq != cwq->gcwq)) + if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; } else if (wait_executing) { worker = find_worker_executing_work(gcwq, work); @@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) flags |= WQ_RESCUER; - /* - * Unbound workqueues aren't concurrency managed and should be - * dispatched to workers immediately. - */ - if (flags & WQ_UNBOUND) - flags |= WQ_HIGHPRI; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); struct global_cwq *gcwq = get_gcwq(cpu); + int pool_idx = (bool)(flags & WQ_HIGHPRI); BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); - cwq->gcwq = gcwq; + cwq->pool = &gcwq->pools[pool_idx]; cwq->wq = wq; cwq->flush_color = -1; cwq->max_active = max_active; @@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. - */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) - -static int __cpuinit trustee_thread(void *__gcwq) +/* claim manager positions of all pools */ +static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) { - struct global_cwq *gcwq = __gcwq; - struct worker *worker; - struct work_struct *work; - struct hlist_node *pos; - long rc; - int i; - - BUG_ON(gcwq->cpu != smp_processor_id()); + struct worker_pool *pool; + for_each_worker_pool(pool, gcwq) + mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); - BUG_ON(rc < 0); - - gcwq->flags |= GCWQ_MANAGING_WORKERS; - - list_for_each_entry(worker, &gcwq->idle_list, entry) - worker->flags |= WORKER_ROGUE; +} - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; +/* release manager positions */ +static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +{ + struct worker_pool *pool; - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. - */ spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); + for_each_worker_pool(pool, gcwq) + mutex_unlock(&pool->manager_mutex); +} - /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. - */ - atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); +static void gcwq_unbind_fn(struct work_struct *work) +{ + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); + struct worker_pool *pool; + struct worker *worker; + struct hlist_node *pos; + int i; - spin_unlock_irq(&gcwq->lock); - del_timer_sync(&gcwq->idle_timer); - spin_lock_irq(&gcwq->lock); + BUG_ON(gcwq->cpu != smp_processor_id()); - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); + gcwq_claim_management_and_lock(gcwq); /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. */ - while (gcwq->nr_workers != gcwq->nr_idle || - gcwq->flags & GCWQ_FREEZING || - gcwq->trustee_state == TRUSTEE_IN_CHARGE) { - int nr_works = 0; - - list_for_each_entry(work, &gcwq->worklist, entry) { - send_mayday(work); - nr_works++; - } + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_UNBOUND; - list_for_each_entry(worker, &gcwq->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_UNBOUND; - if (need_to_create_worker(gcwq)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(gcwq, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); - } - } + gcwq->flags |= GCWQ_DISASSOCIATED; - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } + gcwq_release_management_and_unlock(gcwq); /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - do { - rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); - while (!list_empty(&gcwq->idle_list)) - destroy_worker(list_first_entry(&gcwq->idle_list, - struct worker, entry)); - } while (gcwq->nr_workers && rc >= 0); + schedule(); /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. */ - WARN_ON(!list_empty(&gcwq->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* relinquish manager role */ - gcwq->flags &= ~GCWQ_MANAGING_WORKERS; - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; + for_each_worker_pool(pool, gcwq) + atomic_set(get_pool_nr_running(pool), 0); } -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } -} - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *uninitialized_var(new_worker); - unsigned long flags; - - action &= ~CPU_TASKS_FROZEN; + struct worker_pool *pool; - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - new_worker = create_worker(gcwq, false); - if (!new_worker) { - if (new_trustee) - kthread_stop(new_trustee); - return NOTIFY_BAD; - } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + for_each_worker_pool(pool, gcwq) { + struct worker *worker; - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - BUG_ON(gcwq->first_idle); - gcwq->first_idle = new_worker; - break; + if (pool->nr_workers) + continue; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - destroy_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } break; case CPU_DOWN_FAILED: case CPU_ONLINE: + gcwq_claim_management_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - spin_unlock_irq(&gcwq->lock); - kthread_bind(gcwq->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - gcwq->flags |= GCWQ_MANAGE_WORKERS; - start_worker(gcwq->first_idle); - gcwq->first_idle = NULL; + rebind_workers(gcwq); + gcwq_release_management_and_unlock(gcwq); break; } + return NOTIFY_OK; +} - spin_unlock_irqrestore(&gcwq->lock, flags); +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; - return notifier_from_errno(0); + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + schedule_work_on(cpu, &unbind_work); + flush_work(&unbind_work); + break; + } + return NOTIFY_OK; } #ifdef CONFIG_SMP @@ -3746,6 +3653,7 @@ void thaw_workqueues(void) for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; struct workqueue_struct *wq; spin_lock_irq(&gcwq->lock); @@ -3767,7 +3675,8 @@ void thaw_workqueues(void) cwq_activate_first_delayed(cwq); } - wake_up_worker(gcwq); + for_each_worker_pool(pool, gcwq) + wake_up_worker(pool); spin_unlock_irq(&gcwq->lock); } @@ -3783,46 +3692,57 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); + struct worker_pool *pool; spin_lock_init(&gcwq->lock); - INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; gcwq->flags |= GCWQ_DISASSOCIATED; - INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) INIT_HLIST_HEAD(&gcwq->busy_hash[i]); - init_timer_deferrable(&gcwq->idle_timer); - gcwq->idle_timer.function = idle_worker_timeout; - gcwq->idle_timer.data = (unsigned long)gcwq; + for_each_worker_pool(pool, gcwq) { + pool->gcwq = gcwq; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, - (unsigned long)gcwq); + setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, + (unsigned long)pool); - ida_init(&gcwq->worker_ida); + mutex_init(&pool->manager_mutex); + ida_init(&pool->worker_ida); + } - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); + init_waitqueue_head(&gcwq->rebind_hold); } /* create the initial worker */ for_each_online_gcwq_cpu(cpu) { struct global_cwq *gcwq = get_gcwq(cpu); - struct worker *worker; + struct worker_pool *pool; if (cpu != WORK_CPU_UNBOUND) gcwq->flags &= ~GCWQ_DISASSOCIATED; - worker = create_worker(gcwq, true); - BUG_ON(!worker); - spin_lock_irq(&gcwq->lock); - start_worker(worker); - spin_unlock_irq(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) { + struct worker *worker; + + worker = create_worker(pool); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } } system_wq = alloc_workqueue("events", 0, 0); |