Diffstat (limited to 'kernel'): 146 files changed, 10825 insertions, 8163 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 44511d100ea..d2b32ac27a3 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH config INLINE_SPIN_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ config INLINE_SPIN_UNLOCK_IRQRESTORE def_bool y @@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH config INLINE_READ_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ config INLINE_READ_UNLOCK_IRQRESTORE def_bool y @@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH config INLINE_WRITE_UNLOCK_IRQ def_bool y - depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH + depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool y diff --git a/kernel/Makefile b/kernel/Makefile index 271fd3119af..1ce47553fb0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,14 +2,14 @@ # Makefile for the linux kernel. # -obj-y = fork.o exec_domain.o panic.o printk.o \ +obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ + notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER @@ -24,7 +24,9 @@ endif obj-y += sched/ obj-y += power/ +obj-y += printk/ obj-y += cpu/ +obj-y += irq/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -78,7 +80,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd..7b0e23a740c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && @@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if ((long)sleep_time > 0) + if ((long)sleep_time > 0) { wait_for_auditd(sleep_time); - continue; + continue; + } } if (audit_rate_check() && printk_ratelimit()) printk(KERN_WARNING diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef76..123c9b7c397 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? 
*/ unsigned long ino; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3..43c307dc945 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d199..f7aee8be7fb 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a..9845cb32b60 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; diff --git a/kernel/capability.c b/kernel/capability.c index f6c2ce5701e..4e66bf9275b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -433,18 +433,6 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * nsown_capable - Check superior capability to one's own user_ns - * @cap: The capability in question - * - * Return true if the current task has the given superior capability - * targeted at its own user namespace. 
- */ -bool nsown_capable(int cap) -{ - return ns_capable(current_user_ns(), cap); -} - -/** * inode_capable - Check superior capability over inode * @inode: The inode in question * @cap: The capability in question @@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap) return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); } +EXPORT_SYMBOL(inode_capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a7c9e6ddb97..8bd9cfdc70d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,12 +60,10 @@ #include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> +#include <linux/file.h> #include <linux/atomic.h> -/* css deactivation bias, makes css->refcnt negative to deny new trygets */ -#define CSS_DEACT_BIAS INT_MIN - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -84,7 +82,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif @@ -99,16 +97,19 @@ static DEFINE_MUTEX(cgroup_root_mutex); */ #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; /* - * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the - * subsystems that are otherwise unattached - it never has more than a - * single cgroup, and all tasks are part of that cgroup. + * The dummy hierarchy, reserved for the subsystems that are otherwise + * unattached - it never has more than a single cgroup, and all tasks are + * part of that cgroup. */ -static struct cgroupfs_root rootnode; +static struct cgroupfs_root cgroup_dummy_root; + +/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ +static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; /* * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. @@ -117,6 +118,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -159,9 +161,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. + * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -186,18 +188,28 @@ struct cgroup_event { /* The list of hierarchy roots */ -static LIST_HEAD(roots); -static int root_count; - -static DEFINE_IDA(hierarchy_ida); -static int next_hierarchy_id; -static DEFINE_SPINLOCK(hierarchy_id_lock); +static LIST_HEAD(cgroup_roots); +static int cgroup_root_count; -/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ -#define dummytop (&rootnode.top_cgroup) +/* + * Hierarchy ID allocation and mapping. It follows the same exclusion + * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for + * writes, either for reads. + */ +static DEFINE_IDR(cgroup_hierarchy_idr); static struct cgroup_name root_cgroup_name = { .name = "/" }; +/* + * Assign a monotonically increasing serial number to cgroups. It + * guarantees cgroups with bigger numbers are newer than those with smaller + * numbers. 
Also, as cgroups are always appended to the parent's + * ->children list, it guarantees that sibling cgroups are always sorted in + * the ascending serial number order on the list. Protected by + * cgroup_mutex. + */ +static u64 cgroup_serial_nr_next = 1; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -205,27 +217,38 @@ static struct cgroup_name root_cgroup_name = { .name = "/" }; */ static int need_forkexit_callback __read_mostly; -static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static struct cftype cgroup_base_files[]; -static int css_unbias_refcnt(int refcnt) -{ - return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; -} +static void cgroup_destroy_css_killed(struct cgroup *cgrp); +static int cgroup_destroy_locked(struct cgroup *cgrp); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); -/* the current nr of refs, always >= 0 whether @css is deactivated or not */ -static int css_refcnt(struct cgroup_subsys_state *css) +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { - int v = atomic_read(&css->refcnt); - - return css_unbias_refcnt(v); + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; } /* convenient tests for these bits */ -inline int cgroup_is_removed(const struct cgroup *cgrp) +static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_REMOVED, &cgrp->flags); + return test_bit(CGRP_DEAD, &cgrp->flags); } /** @@ -261,16 +284,38 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -/* - * for_each_subsys() allows you to iterate on each subsystem attached to - * an active hierarchy +/** + * for_each_subsys - iterate all loaded cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * + * Should be called under cgroup_mutex. + */ +#define for_each_subsys(ss, i) \ + for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + !((ss) = cgroup_subsys[i]); })) { } \ + else + +/** + * for_each_builtin_subsys - iterate all built-in cgroup subsystems + * @ss: the iteration cursor + * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end + * + * Bulit-in subsystems are always present and iteration itself doesn't + * require any synchronization. 
*/ -#define for_each_subsys(_root, _ss) \ -list_for_each_entry(_ss, &_root->subsys_list, sibling) +#define for_each_builtin_subsys(ss, i) \ + for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[i]) || true); (i)++) -/* for_each_active_root() allows you to iterate across the active hierarchies */ -#define for_each_active_root(_root) \ -list_for_each_entry(_root, &roots, root_list) +/* iterate each subsystem attached to a hierarchy */ +#define for_each_root_subsys(root, ss) \ + list_for_each_entry((ss), &(root)->subsys_list, sibling) + +/* iterate across the active hierarchies */ +#define for_each_active_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) static inline struct cgroup *__d_cgrp(struct dentry *dentry) { @@ -297,7 +342,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) static bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { + if (cgroup_is_dead(cgrp)) { mutex_unlock(&cgroup_mutex); return false; } @@ -312,20 +357,24 @@ static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); -/* Link structure for associating css_set objects with cgroups */ -struct cg_cgroup_link { - /* - * List running through cg_cgroup_links associated with a - * cgroup, anchored on cgroup->css_sets - */ - struct list_head cgrp_link_list; - struct cgroup *cgrp; - /* - * List running through cg_cgroup_links pointing at a - * single css_set object, anchored on css_set->cg_links - */ - struct list_head cg_link_list; - struct css_set *cg; +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; }; /* The default css_set - used by init and its children prior to any @@ -336,14 +385,16 @@ struct cg_cgroup_link { */ static struct css_set init_css_set; -static struct cg_cgroup_link init_css_set_link; +static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * css_task_iter_start(). 
+ */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -357,106 +408,102 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { - int i; unsigned long key = 0UL; + struct cgroup_subsys *ss; + int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) + for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; -static void __put_css_set(struct css_set *cg, int taskexit) +static void __put_css_set(struct css_set *cset, int taskexit) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cg->refcount, -1, 1)) + if (atomic_add_unless(&cset->refcount, -1, 1)) return; write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cg->refcount)) { + if (!atomic_dec_and_test(&cset->refcount)) { write_unlock(&css_set_lock); return; } /* This css_set is dead. unlink it and release cgroup refcounts */ - hash_del(&cg->hlist); + hash_del(&cset->hlist); css_set_count--; - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { + list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { struct cgroup *cgrp = link->cgrp; - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - /* - * We may not be holding cgroup_mutex, and if cgrp->count is - * dropped to 0 the cgroup can be destroyed at any time, hence - * rcu_read_lock is used to keep it alive. - */ - rcu_read_lock(); - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + + /* @cgrp can't go away while we're holding css_set_lock */ + if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - rcu_read_unlock(); kfree(link); } write_unlock(&css_set_lock); - kfree_rcu(cg, rcu_head); + kfree_rcu(cset, rcu_head); } /* * refcounted get/put for css_set objects */ -static inline void get_css_set(struct css_set *cg) +static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cg->refcount); + atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cg) +static inline void put_css_set(struct css_set *cset) { - __put_css_set(cg, 0); + __put_css_set(cset, 0); } -static inline void put_css_set_taskexit(struct css_set *cg) +static inline void put_css_set_taskexit(struct css_set *cset) { - __put_css_set(cg, 1); + __put_css_set(cset, 1); } -/* +/** * compare_css_sets - helper function for find_existing_css_set(). 
- * @cg: candidate css_set being tested - * @old_cg: existing css_set for a task + * @cset: candidate css_set being tested + * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * - * Returns true if "cg" matches "old_cg" except for the hierarchy + * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ -static bool compare_css_sets(struct css_set *cg, - struct css_set *old_cg, +static bool compare_css_sets(struct css_set *cset, + struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct list_head *l1, *l2; - if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { /* Not all subsystems matched */ return false; } @@ -470,28 +517,28 @@ static bool compare_css_sets(struct css_set *cg, * candidates. */ - l1 = &cg->cg_links; - l2 = &old_cg->cg_links; + l1 = &cset->cgrp_links; + l2 = &old_cset->cgrp_links; while (1) { - struct cg_cgroup_link *cgl1, *cgl2; - struct cgroup *cg1, *cg2; + struct cgrp_cset_link *link1, *link2; + struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ - if (l1 == &cg->cg_links) { - BUG_ON(l2 != &old_cg->cg_links); + if (l1 == &cset->cgrp_links) { + BUG_ON(l2 != &old_cset->cgrp_links); break; } else { - BUG_ON(l2 == &old_cg->cg_links); + BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ - cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); - cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); - cg1 = cgl1->cgrp; - cg2 = cgl2->cgrp; + link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); + link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); + cgrp1 = link1->cgrp; + cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ - BUG_ON(cg1->root != cg2->root); + BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup @@ -500,196 +547,193 @@ static bool compare_css_sets(struct css_set *cg, * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ - if (cg1->root == new_cgrp->root) { - if (cg1 != new_cgrp) + if (cgrp1->root == new_cgrp->root) { + if (cgrp1 != new_cgrp) return false; } else { - if (cg1 != cg2) + if (cgrp1 != cgrp2) return false; } } return true; } -/* - * find_existing_css_set() is a helper for - * find_css_set(), and checks to see whether an existing - * css_set is suitable. 
- * - * oldcg: the cgroup group that we're using before the cgroup - * transition - * - * cgrp: the cgroup that we're moving into - * - * template: location in which to build the desired set of subsystem - * state objects for the new cgroup group +/** + * find_existing_css_set - init css array and find the matching css_set + * @old_cset: the css_set that we're using before the cgroup transition + * @cgrp: the cgroup that we're moving into + * @template: out param for the new set of csses, should be clear on entry */ -static struct css_set *find_existing_css_set( - struct css_set *oldcg, - struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) +static struct css_set *find_existing_css_set(struct css_set *old_cset, + struct cgroup *cgrp, + struct cgroup_subsys_state *template[]) { - int i; struct cgroupfs_root *root = cgrp->root; - struct css_set *cg; + struct cgroup_subsys *ss; + struct css_set *cset; unsigned long key; + int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. while subsystems can change globally, the entries here * won't change, so no need for locking. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ - template[i] = oldcg->subsys[i]; + template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); - hash_for_each_possible(css_set_table, cg, hlist, key) { - if (!compare_css_sets(cg, oldcg, cgrp, template)) + hash_for_each_possible(css_set_table, cset, hlist, key) { + if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ - return cg; + return cset; } /* No existing cgroup group matched */ return NULL; } -static void free_cg_links(struct list_head *tmp) +static void free_cgrp_cset_links(struct list_head *links_to_free) { - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; + struct cgrp_cset_link *link, *tmp_link; - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { + list_del(&link->cset_link); kfree(link); } } -/* - * allocate_cg_links() allocates "count" cg_cgroup_link structures - * and chains them on tmp through their cgrp_link_list fields. Returns 0 on - * success or a negative error +/** + * allocate_cgrp_cset_links - allocate cgrp_cset_links + * @count: the number of links to allocate + * @tmp_links: list_head the allocated links are put on + * + * Allocate @count cgrp_cset_link structures and chain them on @tmp_links + * through ->cset_link. Returns 0 on success or -errno. 
*/ -static int allocate_cg_links(int count, struct list_head *tmp) +static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; int i; - INIT_LIST_HEAD(tmp); + + INIT_LIST_HEAD(tmp_links); + for (i = 0; i < count; i++) { - link = kmalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { - free_cg_links(tmp); + free_cgrp_cset_links(tmp_links); return -ENOMEM; } - list_add(&link->cgrp_link_list, tmp); + list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup - * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() - * @cg: the css_set to be linked + * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() + * @cset: the css_set to be linked * @cgrp: the destination cgroup */ -static void link_css_set(struct list_head *tmp_cg_links, - struct css_set *cg, struct cgroup *cgrp) +static void link_css_set(struct list_head *tmp_links, struct css_set *cset, + struct cgroup *cgrp) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; - BUG_ON(list_empty(tmp_cg_links)); - link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, - cgrp_link_list); - link->cg = cg; + BUG_ON(list_empty(tmp_links)); + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); + link->cset = cset; link->cgrp = cgrp; - atomic_inc(&cgrp->count); - list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_move(&link->cset_link, &cgrp->cset_links); /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation */ - list_add_tail(&link->cg_link_list, &cg->cg_links); + list_add_tail(&link->cgrp_link, &cset->cgrp_links); } -/* - * find_css_set() takes an existing cgroup group and a - * cgroup object, and returns a css_set object that's - * equivalent to the old group, but with the given cgroup - * substituted into the appropriate hierarchy. Must be called with - * cgroup_mutex held +/** + * find_css_set - return a new css_set with one cgroup updated + * @old_cset: the baseline css_set + * @cgrp: the cgroup to be updated + * + * Return a new css_set that's equivalent to @old_cset, but with @cgrp + * substituted into the appropriate hierarchy. 
*/ -static struct css_set *find_css_set( - struct css_set *oldcg, struct cgroup *cgrp) +static struct css_set *find_css_set(struct css_set *old_cset, + struct cgroup *cgrp) { - struct css_set *res; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - struct list_head tmp_cg_links; - - struct cg_cgroup_link *link; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; + struct css_set *cset; + struct list_head tmp_links; + struct cgrp_cset_link *link; unsigned long key; + lockdep_assert_held(&cgroup_mutex); + /* First see if we already have a cgroup group that matches * the desired set */ read_lock(&css_set_lock); - res = find_existing_css_set(oldcg, cgrp, template); - if (res) - get_css_set(res); + cset = find_existing_css_set(old_cset, cgrp, template); + if (cset) + get_css_set(cset); read_unlock(&css_set_lock); - if (res) - return res; + if (cset) + return cset; - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) + cset = kzalloc(sizeof(*cset), GFP_KERNEL); + if (!cset) return NULL; - /* Allocate all the cg_cgroup_link objects that we'll need */ - if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { - kfree(res); + /* Allocate all the cgrp_cset_link objects that we'll need */ + if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { + kfree(cset); return NULL; } - atomic_set(&res->refcount, 1); - INIT_LIST_HEAD(&res->cg_links); - INIT_LIST_HEAD(&res->tasks); - INIT_HLIST_NODE(&res->hlist); + atomic_set(&cset->refcount, 1); + INIT_LIST_HEAD(&cset->cgrp_links); + INIT_LIST_HEAD(&cset->tasks); + INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ - memcpy(res->subsys, template, sizeof(res->subsys)); + memcpy(cset->subsys, template, sizeof(cset->subsys)); write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) c = cgrp; - link_css_set(&tmp_cg_links, res, c); + link_css_set(&tmp_links, cset, c); } - BUG_ON(!list_empty(&tmp_cg_links)); + BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add this cgroup group to the hash table */ - key = css_set_hash(res->subsys); - hash_add(css_set_table, &res->hlist, key); + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); write_unlock(&css_set_lock); - return res; + return cset; } /* @@ -699,7 +743,7 @@ static struct css_set *find_css_set( static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) { - struct css_set *css; + struct css_set *cset; struct cgroup *res = NULL; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -709,13 +753,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * task can't change groups, so the only thing that can happen * is that it exits and its css is set back to init_css_set. 
*/ - css = task->cgroups; - if (css == &init_css_set) { + cset = task_css_set(task); + if (cset == &init_css_set) { res = &root->top_cgroup; } else { - struct cg_cgroup_link *link; - list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; + if (c->root == root) { res = c; break; @@ -785,10 +831,8 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -797,8 +841,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -828,16 +871,9 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { - struct cgroup *cgrp = container_of(work, struct cgroup, free_work); - struct cgroup_subsys *ss; + struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->css_free(cgrp); - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -848,8 +884,6 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - /* * Drop the active superblock reference that we took when we * created the cgroup. 
This will free cgrp->root, if we are @@ -873,7 +907,8 @@ static void cgroup_free_rcu(struct rcu_head *head) { struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - schedule_work(&cgrp->free_work); + INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); + schedule_work(&cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -882,7 +917,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; - BUG_ON(!(cgroup_is_removed(cgrp))); + BUG_ON(!(cgroup_is_dead(cgrp))); call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); @@ -939,27 +974,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_directory - selective removal of base and subsystem files - * @dir: directory containing the files - * @base_files: true if the base files should be removed + * cgroup_clear_dir - remove subsys files in a cgroup directory + * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_directory(struct dentry *dir, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; + int i; - for_each_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); - } - if (base_files) { - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -969,9 +999,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - - cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -988,92 +1015,104 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long final_subsys_mask) + unsigned long added_mask, unsigned removed_mask) { - unsigned long added_mask, removed_mask; struct cgroup *cgrp = &root->top_cgroup; - int i; + struct cgroup_subsys *ss; + unsigned long pinned = 0; + int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); - removed_mask = root->actual_subsys_mask & ~final_subsys_mask; - added_mask = final_subsys_mask & ~root->actual_subsys_mask; /* Check that any added subsystems are currently free */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_mask)) + for_each_subsys(ss, i) { + if (!(added_mask & (1 << i))) continue; - /* - * Nobody should tell us to do a subsys that doesn't exist: - * parse_cgroupfs_options should catch that case and refcounts - * ensure that subsystems won't disappear once selected. - */ - BUG_ON(ss == NULL); - if (ss->root != &rootnode) { - /* Subsystem isn't free */ - return -EBUSY; + + /* is the subsystem mounted elsewhere? 
*/ + if (ss->root != &cgroup_dummy_root) { + ret = -EBUSY; + goto out_put; + } + + /* pin the module */ + if (!try_module_get(ss->module)) { + ret = -ENOENT; + goto out_put; } + pinned |= 1 << i; } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; + /* subsys could be missing if unloaded between parsing and here */ + if (added_mask != pinned) { + ret = -ENOENT; + goto out_put; + } + + ret = cgroup_populate_dir(cgrp, added_mask); + if (ret) + goto out_put; - /* Process each subsystem */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; + /* + * Nothing can fail from this point on. Remove files for the + * removed subsystems and rebind each subsystem. + */ + cgroup_clear_dir(cgrp, removed_mask); + + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; + if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i]); - BUG_ON(!dummytop->subsys[i]); - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); - cgrp->subsys[i] = dummytop->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); + + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgroup_css(cgrp, ss)); + /* refcount was already taken, and we're keeping it */ + root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); + if (ss->bind) - ss->bind(dummytop); - dummytop->subsys[i]->cgroup = dummytop; - cgrp->subsys[i] = NULL; - subsys[i]->root = &rootnode; - list_move(&ss->sibling, &rootnode.subsys_list); + ss->bind(cgroup_css(cgroup_dummy_top, ss)); + + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + + cgroup_subsys[i]->root = &cgroup_dummy_root; + list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); + /* subsystem is now free - drop reference on module */ module_put(ss->module); - } else if (bit & final_subsys_mask) { - /* Subsystem state should already exist */ - BUG_ON(ss == NULL); - BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. - */ - module_put(ss->module); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); + root->subsys_mask &= ~bit; } } - root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; + + /* + * Mark @root has finished binding subsystems. @root->subsys_mask + * now matches the bound subsystems. 
+ */ + root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; + +out_put: + for_each_subsys(ss, i) + if (pinned & (1 << i)) + module_put(ss->module); + return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -1082,7 +1121,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; mutex_lock(&cgroup_root_mutex); - for_each_subsys(root, ss) + for_each_root_subsys(root, ss) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1114,18 +1153,18 @@ struct cgroup_sb_opts { }; /* - * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. This function takes - * refcounts on subsystems to be used, unless it returns error, in which case - * no refcounts are taken. + * Convert a hierarchy specifier into a bitmask of subsystems and + * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] + * array. This function takes refcounts on subsystems to be used, unless it + * returns error, in which case no refcounts are taken. */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; + struct cgroup_subsys *ss; int i; - bool module_pin_failed = false; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1202,10 +1241,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; + for_each_subsys(ss, i) { if (strcmp(token, ss->name)) continue; if (ss->disabled) @@ -1228,16 +1264,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * otherwise if 'none', 'name=' and a subsystem name options * were not specified, let's default to 'all' */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (ss->disabled) - continue; - set_bit(i, &opts->subsys_mask); - } - } + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); /* Consistency checks */ @@ -1275,54 +1305,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!opts->subsys_mask && !opts->name) return -EINVAL; - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - if (!try_module_get(subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. 
- */ - for (i--; i >= 0; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - module_put(subsys[i]->module); - } - return -ENOENT; - } - return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_mask) -{ - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - - if (!(bit & subsys_mask)) - continue; - module_put(subsys[i]->module); - } -} - static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1345,7 +1330,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); @@ -1353,30 +1338,24 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (opts.flags != root->flags || + if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { + pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", + root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; - drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - /* - * Clear out the files of subsystems that should be removed, do - * this before rebind_subsystems, since rebind_subsystems may - * change this hierarchy's subsys_list. - */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); - - ret = rebind_subsystems(root, opts.subsys_mask); - if (ret) { - /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, false, removed_mask); - drop_parsed_module_refcounts(opts.subsys_mask); + /* remounting is not allowed for populated hierarchies */ + if (root->number_of_cgroups > 1) { + ret = -EBUSY; goto out_unlock; } - /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_mask); + ret = rebind_subsystems(root, added_mask, removed_mask); + if (ret) + goto out_unlock; if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1401,12 +1380,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->allcg_node); + INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); - INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -1418,37 +1396,38 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); - INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->name = &root_cgroup_name; + RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); - list_add_tail(&cgrp->allcg_node, &root->allcg_list); + idr_init(&root->cgroup_idr); } -static bool init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct 
cgroupfs_root *root, int start, int end) { - int ret = 0; + int id; - do { - if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) - return false; - spin_lock(&hierarchy_id_lock); - /* Try to allocate the next unused ID */ - ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, - &root->hierarchy_id); - if (ret == -ENOSPC) - /* Try again starting from 0 */ - ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); - if (!ret) { - next_hierarchy_id = root->hierarchy_id + 1; - } else if (ret != -EAGAIN) { - /* Can only get here if the 31-bit IDR is full ... */ - BUG_ON(ret); - } - spin_unlock(&hierarchy_id_lock); - } while (ret); - return true; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_root_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1482,15 +1461,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) if (!root) return ERR_PTR(-ENOMEM); - if (!init_root_id(root)) { - kfree(root); - return ERR_PTR(-ENOMEM); - } init_cgroup_root(root); + /* + * We need to set @root->subsys_mask now so that @root can be + * matched by cgroup_test_super() before it finishes + * initialization; otherwise, competing mounts with the same + * options may try to bind the same subsystems instead of waiting + * for the first one leading to unexpected mount errors. + * SUBSYS_BOUND will be set once actual binding is complete. 
+ */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; - ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1500,17 +1482,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_drop_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroupfs_root *root) { - if (!root) - return; + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); - BUG_ON(!root->hierarchy_id); - spin_lock(&hierarchy_id_lock); - ida_remove(&hierarchy_ida, root->hierarchy_id); - spin_unlock(&hierarchy_id_lock); - ida_destroy(&root->cgroup_ida); - kfree(root); + idr_destroy(&root->cgroup_idr); + kfree(root); + } } static int cgroup_set_super(struct super_block *sb, void *data) @@ -1573,7 +1553,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct list_head tmp_links; struct inode *inode; + const struct cred *cred; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1589,7 +1571,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto drop_modules; + goto out_err; } opts.new_root = new_root; @@ -1597,20 +1579,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - cgroup_drop_root(opts.new_root); - goto drop_modules; + cgroup_free_root(opts.new_root); + goto out_err; } root = sb->s_fs_info; BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; - const struct cred *cred; int i; - struct css_set *cg; + struct css_set *cset; BUG_ON(sb->s_root != NULL); @@ -1623,6 +1603,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, + 0, 1, GFP_KERNEL); + if (root_cgrp->id < 0) + goto unlock_drop; + /* Check for name clashes with existing mounts */ ret = -EBUSY; if (strlen(root->name)) @@ -1637,45 +1622,58 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * that's us. The worst that can happen is that we * have some link structures left over */ - ret = allocate_cg_links(css_set_count, &tmp_cg_links); + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) goto unlock_drop; - ret = rebind_subsystems(root, root->subsys_mask); - if (ret == -EBUSY) { - free_cg_links(&tmp_cg_links); + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); + if (ret) goto unlock_drop; - } + + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to + * create the root files, which doesn't work if SELinux is + * in use. The following cred dancing somehow works around + * it. See 2ce9738ba ("cgroupfs: use init_cred when + * populating new cgroupfs mount") for more details. 
+ */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto rm_base_files; + + ret = rebind_subsystems(root, root->subsys_mask, 0); + if (ret) + goto rm_base_files; + + revert_creds(cred); + /* * There must be no failure case after here, since rebinding * takes care of subsystems' refcounts, which are explicitly * dropped in the failure exit path. */ - /* EBUSY should be the only error here */ - BUG_ON(ret); - - list_add(&root->root_list, &roots); - root_count++; - - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cg, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); write_unlock(&css_set_lock); - free_cg_links(&tmp_cg_links); + free_cgrp_cset_links(&tmp_links); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_mask); - revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1684,9 +1682,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * We re-used an existing hierarchy - the new root (if * any) is not needed */ - cgroup_drop_root(opts.new_root); + cgroup_free_root(opts.new_root); - if (root->flags != opts.flags) { + if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; @@ -1695,23 +1693,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); kfree(opts.name); return dget(sb->s_root); + rm_base_files: + free_cgrp_cset_links(&tmp_links); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); + revert_creds(cred); unlock_drop: + cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1721,49 +1719,53 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; int ret; - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; BUG_ON(!root); BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ - ret = rebind_subsystems(root, 0); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); + if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { + ret = rebind_subsystems(root, 0, root->subsys_mask); + /* Shouldn't be able to fail ... 
*/ + BUG_ON(ret); + } /* - * Release all the links from css_sets to this hierarchy's + * Release all the links from cset_links to this hierarchy's * root cgroup */ write_lock(&css_set_lock); - list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, - cgrp_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); kfree(link); } write_unlock(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); - root_count--; + cgroup_root_count--; } + cgroup_exit_root_id(root); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); kill_litter_super(sb); - cgroup_drop_root(root); + cgroup_free_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1825,13 +1827,52 @@ out: } EXPORT_SYMBOL_GPL(cgroup_path); +/** + * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy + * @task: target task + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Determine @task's cgroup on the first (the one with the lowest non-zero + * hierarchy_id) cgroup hierarchy and copy its path into @buf. This + * function grabs cgroup_mutex and shouldn't be used inside locks used by + * cgroup controller callbacks. + * + * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + */ +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +{ + struct cgroupfs_root *root; + struct cgroup *cgrp; + int hierarchy_id = 1, ret = 0; + + if (buflen < 2) + return -ENAMETOOLONG; + + mutex_lock(&cgroup_mutex); + + root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); + + if (root) { + cgrp = task_cgroup_from_root(task, root); + ret = cgroup_path(cgrp, buf, buflen); + } else { + /* if no hierarchy exists, everyone is in "/" */ + memcpy(buf, "/", 2); + } + + mutex_unlock(&cgroup_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(task_cgroup_path); + /* * Control Group taskset */ struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; - struct css_set *cg; + struct css_set *cset; }; struct cgroup_taskset { @@ -1881,18 +1922,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). 
*/ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset @@ -1910,10 +1953,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *oldcgrp, - struct task_struct *tsk, struct css_set *newcg) +static void cgroup_task_migrate(struct cgroup *old_cgrp, + struct task_struct *tsk, + struct css_set *new_cset) { - struct css_set *oldcg; + struct css_set *old_cset; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1921,25 +1965,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, * css_set to init_css_set and dropping the old one. */ WARN_ON_ONCE(tsk->flags & PF_EXITING); - oldcg = tsk->cgroups; + old_cset = task_css_set(tsk); task_lock(tsk); - rcu_assign_pointer(tsk->cgroups, newcg); + rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); + list_move(&tsk->cg_list, &new_cset->tasks); write_unlock(&css_set_lock); /* - * We just gained a reference on oldcg by taking it from the task. As - * trading it for newcg is protected by cgroup_mutex, we're safe to drop - * it here; it will be freed under RCU. + * We just gained a reference on old_cset by taking it from the + * task. As trading it for new_cset is protected by cgroup_mutex, + * we're safe to drop it here; it will be freed under RCU. */ - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - put_css_set(oldcg); + set_bit(CGRP_RELEASABLE, &old_cgrp->flags); + put_css_set(old_cset); } /** @@ -1995,7 +2039,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2003,7 +2047,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2011,7 +2055,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); @@ -2029,9 +2073,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2044,9 +2090,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * we use find_css_set, which allocates a new one if necessary. 
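/*
 * Sketch of a controller ->attach() method under the css-based calling
 * convention used above; the controller and the per-task work are
 * illustrative.
 */
static void example_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		/* charge or reconfigure @task against @css's state here */
	}
}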
*/ for (i = 0; i < group_size; i++) { + struct css_set *old_cset; + tc = flex_array_get(group, i); - tc->cg = find_css_set(tc->task->cgroups, cgrp); - if (!tc->cg) { + old_cset = task_css_set(tc->task); + tc->cset = find_css_set(old_cset, cgrp); + if (!tc->cset) { retval = -ENOMEM; goto out_put_css_set_refs; } @@ -2059,16 +2108,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } /* nothing is sensitive to fork() after this point. */ /* * step 4: do subsystem attach callbacks. */ - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2079,18 +2130,20 @@ out_put_css_set_refs: if (retval) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - if (!tc->cg) + if (!tc->cset) break; - put_css_set(tc->cg); + put_css_set(tc->cset); } } out_cancel_attach: if (retval) { - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -2191,9 +2244,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk, false); + retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } @@ -2203,34 +2256,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; 
seq_puts(seq, cgrp->root->release_agent_path); @@ -2239,20 +2296,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2270,22 +2327,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2308,7 +2365,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2318,65 +2375,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cfe->css; - if (cgroup_is_removed(cgrp)) - return -ENODEV; if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? 
ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - - if (cgroup_is_removed(cgrp)) - return -ENODEV; + struct cgroup_subsys_state *css = cfe->css; if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2385,11 +2437,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. 
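/*
 * Controller-side sketch of the css-based file callbacks: the handlers
 * receive the css directly and derive their state from it.  The
 * structure, file name and "limit" semantics are illustrative.
 */
struct example_css {
	struct cgroup_subsys_state css;
	u64 limit;
};

static inline struct example_css *to_example_css(struct cgroup_subsys_state *css)
{
	return container_of(css, struct example_css, css);
}

static u64 example_limit_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	return to_example_css(css)->limit;
}

static int example_limit_write(struct cgroup_subsys_state *css,
			       struct cftype *cft, u64 val)
{
	to_example_css(css)->limit = val;
	return 0;
}

static struct cftype example_files[] = {
	{
		.name = "limit",
		.read_u64 = example_limit_read,
		.write_u64 = example_limit_write,
	},
	{ }	/* terminate */
};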
*/ -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; @@ -2398,67 +2445,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; + struct cfent *cfe = m->private; + struct cftype *cft = cfe->type; + struct cgroup_subsys_state *css = cfe->css; + if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(state->cgroup, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_seqfile_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); + + /* + * If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; + rcu_read_unlock(); + + if (!css) + return -ENODEV; + + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state = - kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) + err = single_open(file, cgroup_seqfile_show, cfe); + } else if (cft->open) { err = cft->open(inode, file); - else - err = 0; + } + if (css->ss && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css->ss) + css_put(css); + return ret; } /* @@ -2486,6 +2552,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, cgrp = __d_cgrp(old_dentry); + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow if sane_behavior. 
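/*
 * A seq_file-style control file under the same calling convention
 * (sketch; the output is illustrative):
 */
static int example_stat_show(struct cgroup_subsys_state *css,
			     struct cftype *cft, struct seq_file *m)
{
	seq_printf(m, "is_root %d\n", !css->parent);
	return 0;
}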
+ */ + if (cgroup_sane_behavior(cgrp)) + return -EPERM; + name = cgroup_alloc_name(new_dentry); if (!name) return -ENOMEM; @@ -2496,7 +2569,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return ret; } - old_name = cgrp->name; + old_name = rcu_dereference_protected(cgrp->name, true); rcu_assign_pointer(cgrp->name, name); kfree_rcu(old_name, rcu_head); @@ -2577,7 +2650,7 @@ static const struct inode_operations cgroup_file_inode_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, + .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, @@ -2587,14 +2660,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -{ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; -} - /* * Check if a file is a control file */ @@ -2673,8 +2738,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2684,8 +2748,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2719,11 +2784,25 @@ out: return error; } -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @cgrp: the target cgroup + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. + */ +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; - int err, ret = 0; + int ret; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2735,70 +2814,90 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - err = cgroup_add_file(cgrp, subsys, cft); - if (err) + ret = cgroup_add_file(cgrp, cft); + if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, err); - ret = err; + cft->name, ret); + return ret; + } } else { cgroup_rm_file(cgrp, cft); } } - return ret; + return 0; } -static DEFINE_MUTEX(cgroup_cft_mutex); - static void cgroup_cfts_prepare(void) - __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) + __acquires(&cgroup_mutex) { /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. 
- * Instead, we increment reference on all cgroups and build list of - * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure - * exclusive access to the field. + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). */ - mutex_lock(&cgroup_cft_mutex); mutex_lock(&cgroup_mutex); } -static void cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) + __releases(&cgroup_mutex) { LIST_HEAD(pending); - struct cgroup *cgrp, *n; + struct cgroup_subsys *ss = cfts[0].ss; + struct cgroup *root = &ss->root->top_cgroup; + struct super_block *sb = ss->root->sb; + struct dentry *prev = NULL; + struct inode *inode; + struct cgroup_subsys_state *css; + u64 update_before; + int ret = 0; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (cfts && ss->root != &rootnode) { - list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { - dget(cgrp->dentry); - list_add_tail(&cgrp->cft_q_node, &pending); - } + if (!cfts || ss->root == &cgroup_dummy_root || + !atomic_inc_not_zero(&sb->s_active)) { + mutex_unlock(&cgroup_mutex); + return 0; } - mutex_unlock(&cgroup_mutex); - /* - * All new cgroups will see @cfts update on @ss->cftsets. Add/rm - * files for all cgroups which were created before. + * All cgroups which are created after we drop cgroup_mutex will + * have the updated set of files, so we only need to update the + * cgroups created before the current @cgroup_serial_nr_next. */ - list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { - struct inode *inode = cgrp->dentry->d_inode; + update_before = cgroup_serial_nr_next; + + mutex_unlock(&cgroup_mutex); + + /* add/rm files for all cgroups created before */ + rcu_read_lock(); + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + + if (cgroup_is_dead(cgrp)) + continue; + + inode = cgrp->dentry->d_inode; + dget(cgrp->dentry); + rcu_read_unlock(); + + dput(prev); + prev = cgrp->dentry; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - if (!cgroup_is_removed(cgrp)) - cgroup_addrm_files(cgrp, ss, cfts, is_add); + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - list_del_init(&cgrp->cft_q_node); - dput(cgrp->dentry); + rcu_read_lock(); + if (ret) + break; } - - mutex_unlock(&cgroup_cft_mutex); + rcu_read_unlock(); + dput(prev); + deactivate_super(sb); + return ret; } /** @@ -2818,48 +2917,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; + int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - cgroup_cfts_commit(ss, cfts, true); - - return 0; + ret = cgroup_cfts_commit(cfts, true); + if (ret) + cgroup_rm_cftypes(cfts); + return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. 
Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { - list_del_init(&set->node); - cgroup_cfts_commit(ss, cfts, false); + list_del(&set->node); + kfree(set); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -2872,46 +2979,20 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->refcount); - } + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); read_unlock(&css_set_lock); return count; } /* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) -{ - struct list_head *l = it->cg_link; - struct cg_cgroup_link *link; - struct css_set *cg; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->css_sets) { - it->cg_link = NULL; - return; - } - link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); - cg = link->cg; - } while (list_empty(&cg->tasks)); - it->cg_link = l; - it->task = cg->tasks.next; -} - -/* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -2934,7 +3015,7 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. 
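/*
 * Registration sketch for the cftype interfaces above; example_subsys
 * and example_files stand in for a real controller's subsystem and file
 * array.
 */
static int __init example_register_files(void)
{
	return cgroup_add_cftypes(&example_subsys, example_files);
}

/* the matching teardown no longer needs the subsystem pointer: */
static void example_unregister_files(void)
{
	cgroup_rm_cftypes(example_files);
}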
*/ if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); + list_add(&p->cg_list, &task_css_set(p)->tasks); task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -2942,54 +3023,121 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk - * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* + * @pos could already have been removed. Once a cgroup is removed, + * its ->sibling.next is no longer updated when its next sibling + * changes. As CGRP_DEAD assertion is serialized and happens + * before the cgroup is taken off the ->sibling list, if we see it + * unasserted, it's guaranteed that the next sibling hasn't + * finished its grace period even if it's already removed, and thus + * safe to dereference from this RCU critical section. If + * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed + * to be visible as %true here. + * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. + */ + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; + } + + if (&next->sibling == &cgrp->children) + return NULL; + + return cgroup_css(next, parent_css->ss); +} +EXPORT_SYMBOL_GPL(css_next_child); + +/** + * css_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: css whose descendants to walk + * + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. 
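/*
 * Sketch of walking the direct children of a css with the iterator
 * above; do_child() is a stand-in for controller work.
 */
static void example_walk_children(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *child;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		do_child(child);
	rcu_read_unlock();
}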
This + * function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. + */ +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) +{ + struct cgroup_subsys_state *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit @root */ if (!pos) - pos = cgroup; + return root; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, - sibling); - if (&next->sibling != &pos->parent->children) + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); + if (next) return next; - - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. + * + * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct rightmost descendant as long as @pos is + * accessible. */ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -2997,100 +3145,167 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk + * + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. 
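/*
 * Pre-order subtree walk (sketch): @root is visited first and each step
 * runs under rcu_read_lock().  do_descendant() is a stand-in.
 */
static void example_walk_subtree(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		do_descendant(pos);
	rcu_read_unlock();
}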
+ * While this function requires RCU read locking, it doesn't require the + * whole traversal to be contained in a single RCU critical section. This + * function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. */ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return cgroup_leftmost_descendant(next); + next = css_next_child(pos, css_parent(pos)); + if (next) + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + return css_parent(pos); +} +EXPORT_SYMBOL_GPL(css_next_descendant_post); + +/** + * css_advance_task_iter - advance a task itererator to the next css_set + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. + */ +static void css_advance_task_iter(struct css_task_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &it->origin_css->cgroup->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @css. The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. 
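/*
 * Post-order variant (sketch): children are visited before their parent,
 * the natural order for teardown-style operations.  do_descendant() is a
 * stand-in.
 */
static void example_walk_subtree_post(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		do_descendant(pos);
	rcu_read_unlock();
}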
*/ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->cg_link = &cgrp->css_sets; - cgroup_advance_iter(cgrp, it); + + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; + + css_advance_task_iter(it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * css_task_iter_next - return the next task for the iterator + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. + */ +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cg_link) + if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); - if (l == &link->cg->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); + if (l == &link->cset->tasks) { + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + css_advance_task_iter(it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * css_task_iter_end - finish task iteration + * @it: the task iterator to finish + * + * Finish task iteration started by css_task_iter_start(). + */ +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3131,46 +3346,49 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. 
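/*
 * Sketch of the renamed task iterator: count the tasks attached to a
 * css.  The caller may not sleep between start and end.
 */
static int example_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);
	return count;
}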
- * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates css_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @css for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3183,25 +3401,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the css, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. This * guarantees forward progress and that we don't miss any tasks. 
*/ heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3229,7 +3446,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cg, &it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3239,7 +3456,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3256,10 +3473,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3273,15 +3489,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cg = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3323,7 +3532,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3396,7 +3605,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). 
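/*
 * Sketch of a css_scan_tasks() caller that supplies both callbacks;
 * cgroup_transfer_tasks() above is the in-tree user that passes a NULL
 * @test.  The callbacks here are illustrative.
 */
static bool example_test(struct task_struct *task, void *data)
{
	return !(task->flags & PF_EXITING);
}

static void example_process(struct task_struct *task, void *data)
{
	/* update @task; css_set_lock is not held here */
}

static int example_scan(struct cgroup_subsys_state *css)
{
	return css_scan_tasks(css, example_test, example_process, NULL, NULL);
}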
@@ -3405,23 +3614,21 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } } /* entry not found; create a new one */ - l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); - l->use_count = 0; /* don't increment here */ - l->list = NULL; l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); mutex_unlock(&cgrp->pidlist_mutex); @@ -3437,7 +3644,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3452,8 +3659,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3464,7 +3671,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3480,7 +3687,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3498,7 +3705,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3512,8 +3719,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3533,7 +3740,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + css_task_iter_end(&it); err: return ret; @@ -3558,7 +3765,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3585,7 +3792,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3631,7 +3838,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. 
*/ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3639,12 +3846,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) @@ -3708,25 +3915,41 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } /* + * When dput() is called asynchronously, if umount has been done and + * then deactivate_super() in cgroup_free_fn() kills the superblock, + * there's a small window that vfs will see the root dentry with non-zero + * refcnt and trigger BUG(). + * + * That's why we hold a reference before dput() and drop it right after. + */ +static void cgroup_dput(struct cgroup *cgrp) +{ + struct super_block *sb = cgrp->root->sb; + + atomic_inc(&sb->s_active); + dput(cgrp->dentry); + deactivate_super(sb); +} + +/* * Unregister event and free resources. * * Gets called from workqueue. @@ -3735,18 +3958,18 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); eventfd_ctx_put(event->eventfd); kfree(event); - dput(cgrp->dentry); + css_put(css); } /* @@ -3759,7 +3982,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -3803,14 +4026,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation. 
*/ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, const char *buffer) { - struct cgroup_event *event = NULL; - struct cgroup *cgrp_cfile; + struct cgroup *cgrp = dummy_css->cgroup; + struct cgroup_event *event; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; + struct fd efile; + struct fd cfile; char *endp; int ret; @@ -3827,139 +4051,126 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); INIT_WORK(&event->remove, cgroup_event_remove); - efile = eventfd_fget(efd); - if (IS_ERR(efile)) { - ret = PTR_ERR(efile); - goto fail; + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; } - event->eventfd = eventfd_ctx_fileget(efile); + event->eventfd = eventfd_ctx_fileget(efile.file); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto fail; + goto out_put_efile; } - cfile = fget(cfd); - if (!cfile) { + cfile = fdget(cfd); + if (!cfile.file) { ret = -EBADF; - goto fail; + goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile), MAY_READ); + ret = inode_permission(file_inode(cfile.file), MAY_READ); if (ret < 0) - goto fail; + goto out_put_cfile; - event->cft = __file_cft(cfile); + event->cft = __file_cft(cfile.file); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); - goto fail; + goto out_put_cfile; + } + + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; } /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto fail; - } + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto fail; + goto out_put_css; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto fail; - - efile->f_op->poll(efile, &event->pt); + goto out_put_css; - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. 
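/*
 * Userspace view of the interface above (sketch).  The cgroup directory,
 * the monitored memory-controller file and the 64M threshold are
 * illustrative; error handling is omitted.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int wait_for_threshold(const char *cgdir)
{
	char path[256], buf[64];
	uint64_t cnt;
	int efd = eventfd(0, 0);

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cgdir);
	int cfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cgdir);
	int ecfd = open(path, O_WRONLY);

	/* "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 64ULL << 20);
	write(ecfd, buf, strlen(buf));

	return read(efd, &cnt, sizeof(cnt));	/* blocks until the event fires */
}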
- */ - dget(cgrp->dentry); + efile.file->f_op->poll(efile.file, &event->pt); spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); - fput(cfile); - fput(efile); + fdput(cfile); + fdput(efile); return 0; -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - +out_put_css: + css_put(event->css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: kfree(event); return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } -/* - * for the common functions, 'private' gives the type of file - */ -/* for hysterical raisins, we can't put this on the older files */ -#define CGROUP_FILE_GENERIC_PREFIX "cgroup." -static struct cftype files[] = { - { - .name = "tasks", - .open = cgroup_tasks_open, - .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, - .mode = S_IRUGO | S_IWUSR, - }, +static struct cftype cgroup_base_files[] = { { - .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .name = "cgroup.procs", .open = cgroup_procs_open, .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .name = "cgroup.event_control", .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, @@ -3974,9 +4185,29 @@ static struct cftype files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_sane_behavior_show, }, + + /* + * Historical crazy stuff. These don't have "cgroup." prefix and + * don't exist if sane_behavior. If you're depending on these, be + * prepared to be burned. 
+ */ + { + .name = "tasks", + .flags = CFTYPE_INSANE, /* use "procs" instead */ + .open = cgroup_tasks_open, + .write_u64 = cgroup_tasks_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "notify_on_release", + .flags = CFTYPE_INSANE, + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, { .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, @@ -3985,101 +4216,145 @@ static struct cftype files[] = { }; /** - * cgroup_populate_dir - selectively creation of files in a directory + * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be added * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - int err; struct cgroup_subsys *ss; - - if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, files, true); - if (err < 0) - return err; - } + int i, ret = 0; /* process cftsets of each subsystem */ - for_each_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, ss, set->cfts, true); + list_for_each_entry(set, &ss->cftsets, node) { + ret = cgroup_addrm_files(cgrp, set->cfts, true); + if (ret < 0) + goto err; + } } /* This cgroup is ready now */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + struct css_id *id = rcu_dereference_protected(css->id, true); + /* * Update id->css pointer and make this css visible from * CSS ID functions. This pointer will be dereferened * from RCU-read-side without locks. */ - if (css->id) - rcu_assign_pointer(css->id->css, css); + if (id) + rcu_assign_pointer(id->css, css); } return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; } -static void css_dput_fn(struct work_struct *work) +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. 
+ */ +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); - struct dentry *dentry = css->cgroup->dentry; - struct super_block *sb = dentry->d_sb; + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; - atomic_inc(&sb->s_active); - dput(dentry); - deactivate_super(sb); + if (css->parent) + css_put(css->parent); + + css->ss->css_free(css); + cgroup_dput(cgrp); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void css_free_rcu_fn(struct rcu_head *rcu_head) +{ + struct cgroup_subsys_state *css = + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); + + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context which we don't have. + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); +} + +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); + css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == dummytop) + + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss); + else css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); - cgrp->subsys[ss->subsys_id] = css; - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). 
- */ - INIT_WORK(&css->dput_work, css_dput_fn); + BUG_ON(cgroup_css(cgrp, ss)); } -/* invoke ->post_create() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +/* invoke ->css_online() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); - if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + ret = ss->css_online(css); + if (!ret) { + css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } -/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4087,9 +4362,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* @@ -4103,6 +4380,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4120,7 +4398,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_cgrp; rcu_assign_pointer(cgrp->name, name); - cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. 
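/*
 * Minimal sketch of the reserve-then-publish idr pattern used for
 * cgrp->id above (standalone illustration; "example_idr" and
 * "struct example" are made-up names, not part of this patch).
 */
#include <linux/idr.h>
#include <linux/gfp.h>

struct example {
	int id;
};

static DEFINE_IDR(example_idr);

static int example_publish(struct example *ex)
{
	int id;

	/* reserve an id but publish NULL so lookups don't see @ex yet */
	id = idr_alloc(&example_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	ex->id = id;

	/* ... finish initializing @ex ... */

	idr_replace(&example_idr, ex, id);	/* now idr_find() returns @ex */
	return 0;
}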
+ */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); if (cgrp->id < 0) goto err_free_name; @@ -4149,6 +4431,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4157,17 +4440,24 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_subsys(root, ss) { + for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; } - init_cgroup_css(css, ss, cgrp); + css_ar[ss->subsys_id] = css; + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free_all; + + init_css(css, ss, cgrp); + if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4183,21 +4473,28 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); + cgrp->serial_nr = cgroup_serial_nr_next++; + /* allocation complete, commit to creation */ - list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + dget(dentry); + css_get(css->parent); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); /* creation succeeded, notify subsystems */ - for_each_subsys(root, ss) { - err = online_css(ss, cgrp); + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + + err = online_css(css); if (err) goto err_destroy; @@ -4211,7 +4508,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (err) + goto err_destroy; + + err = cgroup_populate_dir(cgrp, root->subsys_mask); if (err) goto err_destroy; @@ -4221,15 +4524,19 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; err_free_all: - for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->css_free(cgrp); + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; + + if (css) { + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + } } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free_id: - ida_simple_remove(&root->cgroup_ida, cgrp->id); + idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: @@ -4251,63 +4558,188 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. 
+ */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; + + mutex_lock(&cgroup_mutex); + + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); +} + +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected. Also, cgroup core needs to + * guarantee that css_tryget() won't succeed by the time ->css_offline() is + * invoked. To satisfy all the requirements, destruction is implemented in + * the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying. Remove all + * userland visible parts and start killing the percpu refcnts of + * css's. Set up so that the next stage will be kicked off once all + * the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + * rest of destruction. Once all cgroup references are gone, the + * cgroup is RCU-freed. + * + * This function implements s1. After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created. 
As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup *parent = cgrp->parent; struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; + struct cgroup *child; + bool empty; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) + /* + * css_set_lock synchronizes access to ->cset_links and prevents + * @cgrp from being removed while __put_css_set() is in progress. + */ + read_lock(&css_set_lock); + empty = list_empty(&cgrp->cset_links); + read_unlock(&css_set_lock); + if (!empty) return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt and mark @cgrp - * removed. This makes future css_tryget() and child creation - * attempts fail thus maintaining the removal conditions verified - * above. + * Make sure there's no live children. We can't test ->children + * emptiness as dead children linger on it while being destroyed; + * otherwise, "rmdir parent/child parent" may fail with -EBUSY. */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); + empty = true; + rcu_read_lock(); + list_for_each_entry_rcu(child, &cgrp->children, sibling) { + empty = cgroup_is_dead(child); + if (!empty) + break; } - set_bit(CGRP_REMOVED, &cgrp->flags); + rcu_read_unlock(); + if (!empty) + return -EBUSY; - /* tell subsystems to initate destruction */ - for_each_subsys(cgrp->root, ss) - offline_css(ss, cgrp); + /* + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. + */ + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss)); /* - * Put all the base refs. Each css holds an extra reference to the - * cgroup's dentry and cgroup removal proceeds regardless of css - * refs. On the last put of each css, whenever that may be, the - * extra dentry ref is put so that dentry destruction happens only - * after all css's are released. + * Mark @cgrp dead. This prevents further task migration and child + * creation by disabling cgroup_lock_live_group(). Note that + * CGRP_DEAD assertion is depended upon by css_next_child() to + * resume iteration after dropping RCU read lock. See + * css_next_child() for details. */ - for_each_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + set_bit(CGRP_DEAD, &cgrp->flags); + /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - list_del_init(&cgrp->allcg_node); + /* + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. + */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + /* + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. 
+ */ + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); - dput(d); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); /* * Unregister events and notify userspace. @@ -4322,6 +4754,39 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) spin_unlock(&cgrp->event_list_lock); return 0; +}; + +/** + * cgroup_destroy_css_killed - the second step of cgroup destruction + * @work: cgroup->destroy_free_work + * + * This function is invoked from a work item for a cgroup which is being + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). + */ +static void cgroup_destroy_css_killed(struct cgroup *cgrp) +{ + struct cgroup *parent = cgrp->parent; + struct dentry *d = cgrp->dentry; + + lockdep_assert_held(&cgroup_mutex); + + /* delete this cgroup from parent->children */ + list_del_rcu(&cgrp->sibling); + + /* + * We should remove the cgroup object from idr before its grace + * period starts, so we won't be looking up a cgroup while the + * cgroup is being freed. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + + dput(d); + + set_bit(CGRP_RELEASABLE, &parent->flags); + check_for_release(parent); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) @@ -4344,6 +4809,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } @@ -4361,12 +4831,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; - css = ss->css_alloc(dummytop); + list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); + ss->root = &cgroup_dummy_root; + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, dummytop); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4381,7 +4851,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, dummytop)); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4404,7 +4874,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int i, ret; struct hlist_node *tmp; - struct css_set *cg; + struct css_set *cset; unsigned long key; /* check name and function validity */ @@ -4427,7 +4897,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) */ if (ss->module == NULL) { /* a sanity check */ - BUG_ON(subsys[ss->subsys_id] != ss); + BUG_ON(cgroup_subsys[ss->subsys_id] != ss); return 0; } @@ -4435,27 +4905,27 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); - subsys[ss->subsys_id] = ss; + cgroup_subsys[ss->subsys_id] = ss; /* * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the rootnode + * struct, so this can happen first (i.e. 
before the dummy root * attachment). */ - css = ss->css_alloc(dummytop); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { - /* failure case - need to deassign the subsys[] slot. */ - subsys[ss->subsys_id] = NULL; + /* failure case - need to deassign the cgroup_subsys[] slot. */ + cgroup_subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } - list_add(&ss->sibling, &rootnode.subsys_list); - ss->root = &rootnode; + list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); + ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, dummytop); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4471,21 +4941,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. */ write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { + hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) + if (cset->subsys[ss->subsys_id]) continue; /* remove existing entry */ - hash_del(&cg->hlist); + hash_del(&cset->hlist); /* set new value */ - cg->subsys[ss->subsys_id] = css; + cset->subsys[ss->subsys_id] = css; /* recompute hash and restore entry */ - key = css_set_hash(cg->subsys); - hash_add(css_set_table, &cg->hlist, key); + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); } write_unlock(&css_set_lock); - ret = online_css(ss, dummytop); + ret = online_css(css); if (ret) goto err_unload; @@ -4511,54 +4981,55 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); */ void cgroup_unload_subsys(struct cgroup_subsys *ss) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; BUG_ON(ss->module == NULL); /* * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it + * try_module_get() in rebind_subsystems() should ensure that it * doesn't start being used while we're killing it off. */ - BUG_ON(ss->root != &rootnode); + BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); - offline_css(ss, dummytop); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); /* deassign the subsys_id */ - subsys[ss->subsys_id] = NULL; + cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from rootnode's list of subsystems */ + /* remove subsystem from the dummy root's list of subsystems */ list_del_init(&ss->sibling); /* - * disentangle the css from all css_sets attached to the dummytop. as - * in loading, we need to pay our respects to the hashtable gods. + * disentangle the css from all css_sets attached to the dummy + * top. as in loading, we need to pay our respects to the hashtable + * gods. 
*/ write_lock(&css_set_lock); - list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; + list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { + struct css_set *cset = link->cset; unsigned long key; - hash_del(&cg->hlist); - cg->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cg->subsys); - hash_add(css_set_table, &cg->hlist, key); + hash_del(&cset->hlist); + cset->subsys[ss->subsys_id] = NULL; + key = css_set_hash(cset->subsys); + hash_add(css_set_table, &cset->hlist, key); } write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to - * free before marking as null because ss->css_free needs the - * cgrp->subsys pointer to find their state. note that this also - * takes care of freeing the css_id. + * remove subsystem's css from the cgroup_dummy_top and free it - + * need to free before marking as null because ss->css_free needs + * the cgrp->subsys pointer to find their state. note that this + * also takes care of freeing the css_id. */ - ss->css_free(dummytop); - dummytop->subsys[ss->subsys_id] = NULL; + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } @@ -4572,30 +5043,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); */ int __init cgroup_init_early(void) { + struct cgroup_subsys *ss; int i; + atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cg_links); + INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; - init_cgroup_root(&rootnode); - root_count = 1; - init_task.cgroups = &init_css_set; - - init_css_set_link.cg = &init_css_set; - init_css_set_link.cgrp = dummytop; - list_add(&init_css_set_link.cgrp_link_list, - &rootnode.top_cgroup.css_sets); - list_add(&init_css_set_link.cg_link_list, - &init_css_set.cg_links); - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* at bootup time, we don't worry about modular subsystems */ - if (!ss || ss->module) - continue; + init_cgroup_root(&cgroup_dummy_root); + cgroup_root_count = 1; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); + + init_cgrp_cset_link.cset = &init_css_set; + init_cgrp_cset_link.cgrp = cgroup_dummy_top; + list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); + list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); + /* at bootup time, we don't worry about modular subsystems */ + for_each_builtin_subsys(ss, i) { BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->css_alloc); @@ -4620,30 +5086,37 @@ int __init cgroup_init_early(void) */ int __init cgroup_init(void) { - int err; - int i; + struct cgroup_subsys *ss; unsigned long key; + int i, err; err = bdi_init(&cgroup_backing_dev_info); if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* at bootup time, we don't worry about modular subsystems */ - if (!ss || ss->module) - continue; + for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } + /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); + /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(!init_root_id(&rootnode)); + + 
BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + + err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, + 0, 1, GFP_KERNEL); + BUG_ON(err < 0); + + mutex_unlock(&cgroup_root_mutex); + mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) { @@ -4708,7 +5181,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) int count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_subsys(root, ss) + for_each_root_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", @@ -4734,6 +5207,7 @@ out: /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { + struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -4743,14 +5217,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) * subsys/hierarchy state. */ mutex_lock(&cgroup_mutex); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; + + for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); - } + mutex_unlock(&cgroup_mutex); return 0; } @@ -4786,8 +5258,8 @@ static const struct file_operations proc_cgroupstats_operations = { void cgroup_fork(struct task_struct *child) { task_lock(current); + get_css_set(task_css_set(current)); child->cgroups = current->cgroups; - get_css_set(child->cgroups); task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } @@ -4799,11 +5271,12 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) { + struct cgroup_subsys *ss; int i; /* @@ -4821,7 +5294,7 @@ void cgroup_post_fork(struct task_struct *child) write_lock(&css_set_lock); task_lock(child); if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &child->cgroups->tasks); + list_add(&child->cg_list, &task_css_set(child)->tasks); task_unlock(child); write_unlock(&css_set_lock); } @@ -4840,12 +5313,9 @@ void cgroup_post_fork(struct task_struct *child) * of the array can be freed at module unload, so we * can't touch that. */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - + for_each_builtin_subsys(ss, i) if (ss->fork) ss->fork(child); - } } } @@ -4886,7 +5356,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - struct css_set *cg; + struct cgroup_subsys *ss; + struct css_set *cset; int i; /* @@ -4903,36 +5374,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) /* Reassign the task to the init_css_set. */ task_lock(tsk); - cg = tsk->cgroups; - tsk->cgroups = &init_css_set; + cset = task_css_set(tsk); + RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { /* * fork/exit callbacks are supported only for builtin * subsystems, see cgroup_post_fork() for details. 
*/ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - + for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = - rcu_dereference_raw(cg->subsys[i])->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); + + ss->exit(css, old_css, tsk); } } } task_unlock(tsk); - put_css_set_taskexit(cg); + put_css_set_taskexit(cset); } static void check_for_release(struct cgroup *cgrp) { - /* All of these checks rely on RCU to keep the cgroup - * structure alive */ if (cgroup_is_releasable(cgrp) && - !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue @@ -4941,7 +5408,7 @@ static void check_for_release(struct cgroup *cgrp) int need_schedule_work = 0; raw_spin_lock(&release_list_lock); - if (!cgroup_is_removed(cgrp) && + if (!cgroup_is_dead(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; @@ -4952,34 +5419,6 @@ static void check_for_release(struct cgroup *cgrp) } } -/* Caller must verify that the css is not for root cgroup */ -bool __css_tryget(struct cgroup_subsys_state *css) -{ - while (true) { - int t, v; - - v = css_refcnt(css); - t = atomic_cmpxchg(&css->refcnt, v, v + 1); - if (likely(t == v)) - return true; - else if (t < 0) - return false; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(__css_tryget); - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css) -{ - int v; - - v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - if (v == 0) - schedule_work(&css->dput_work); -} -EXPORT_SYMBOL_GPL(__css_put); - /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path @@ -5054,23 +5493,19 @@ static void cgroup_release_agent(struct work_struct *work) static int __init cgroup_disable(char *str) { - int i; + struct cgroup_subsys *ss; char *token; + int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * cgroup_disable, being at boot time, can't - * know about module subsystems, so we don't - * worry about them. - */ - if (!ss || ss->module) - continue; + /* + * cgroup_disable, being at boot time, can't know about + * module subsystems, so we don't worry about them. + */ + for_each_builtin_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" @@ -5087,9 +5522,7 @@ __setup("cgroup_disable=", cgroup_disable); * Functons for CSS ID. */ -/* - *To get ID other than 0, this should be called when !cgroup_is_removed(). - */ +/* to get ID other than 0, this should be called when !cgroup_is_dead() */ unsigned short css_id(struct cgroup_subsys_state *css) { struct css_id *cssid; @@ -5099,7 +5532,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. 
*/ - cssid = rcu_dereference_check(css->id, css_refcnt(css)); + cssid = rcu_dereference_raw(css->id); if (cssid) return cssid->id; @@ -5107,18 +5540,6 @@ unsigned short css_id(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_id); -unsigned short css_depth(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - cssid = rcu_dereference_check(css->id, css_refcnt(css)); - - if (cssid) - return cssid->depth; - return 0; -} -EXPORT_SYMBOL_GPL(css_depth); - /** * css_is_ancestor - test "root" css is an ancestor of "child" * @child: the css to be tested. @@ -5153,7 +5574,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { - struct css_id *id = css->id; + struct css_id *id = rcu_dereference_protected(css->id, true); + /* When this is called before css_id initialization, id can be NULL */ if (!id) return; @@ -5219,25 +5641,21 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return PTR_ERR(newid); newid->stack[0] = newid->id; - newid->css = rootcss; - rootcss->id = newid; + RCU_INIT_POINTER(newid->css, rootcss); + RCU_INIT_POINTER(rootcss->id, newid); return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; - parent_id = parent_css->id; + parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); @@ -5275,31 +5693,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* is @dentry a cgroup dir? */ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); +} - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; - return css ? css : ERR_PTR(-ENOENT); +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). 
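/*
 * Usage sketch for css_from_id() (illustrative, not part of this patch;
 * the freezer subsystem is used only as an example caller and
 * "stored_id" is assumed to be a cgroup id recorded earlier).
 */
static struct cgroup_subsys_state *pin_css_by_id(int stored_id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(stored_id, &freezer_subsys);
	if (css && !css_tryget(css))
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller drops the reference with css_put() when done */
}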
+ */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss); + return NULL; } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5309,48 +5752,45 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) return css; } -static void debug_css_free(struct cgroup *cont) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cont->subsys[debug_subsys_id]); + kfree(css); } -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return atomic_read(&cont->count); + return cgroup_task_count(css->cgroup); } -static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - return cgroup_task_count(cont); -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { u64 count; rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); + count = atomic_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } -static int current_css_set_cg_links_read(struct cgroup *cont, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; - struct css_set *cg; + struct cgrp_cset_link *link; + struct css_set *cset; read_lock(&css_set_lock); rcu_read_lock(); - cg = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cg->cg_links, cg_link_list) { + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; const char *name; @@ -5367,19 +5807,18 @@ static int current_css_set_cg_links_read(struct cgroup *cont, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cont, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - struct cg_cgroup_link *link; + struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { - struct css_set *cg = link->cg; + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; struct task_struct *task; int count = 0; - seq_printf(seq, "css_set %p\n", cg); - list_for_each_entry(task, &cg->tasks, cg_list) { + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { if (count++ > MAX_TASKS_SHOWN_PER_CSS) { seq_puts(seq, " ...\n"); break; @@ -5393,17 +5832,13 @@ static int cgroup_css_links_read(struct cgroup *cont, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { 
- return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { .name = "taskcount", .read_u64 = debug_taskcount_read, }, diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea502..f0ff64d0eba 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -45,25 +45,19 @@ struct freezer { spinlock_t lock; }; -static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); + return css ? container_of(css, struct freezer, css) : NULL; } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_subsys_state(task, freezer_subsys_id), - struct freezer, css); + return css_freezer(task_css(task, freezer_subsys_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { - struct cgroup *pcg = freezer->css.cgroup->parent; - - if (pcg) - return cgroup_freezer(pcg); - return NULL; + return css_freezer(css_parent(&freezer->css)); } bool cgroup_freezing(struct task_struct *task) @@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; @@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) } /** - * freezer_css_online - commit creation of a freezer cgroup - * @cgroup: cgroup being created + * freezer_css_online - commit creation of a freezer css + * @css: css being created * - * We're committing to creation of @cgroup. Mark it online and inherit + * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_css_online(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); /* * The following double locking and freezing state inheritance * guarantee that @cgroup can never escape ancestors' freezing - * states. See cgroup_for_each_descendant_pre() for details. + * states. See css_for_each_descendant_pre() for details. */ if (parent) spin_lock_irq(&parent->lock); @@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) } /** - * freezer_css_offline - initiate destruction of @cgroup - * @cgroup: cgroup being destroyed + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed * - * @cgroup is going away. Mark it dead and decrement system_freezing_count - * if it was holding one. + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. 
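/*
 * The css-based prototypes the freezer is converted to here are the same
 * conversion applied controller-wide in this series.  A minimal sketch of
 * a hypothetical "foo" controller (all names invented for illustration):
 */
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct foo {
	struct cgroup_subsys_state	css;
	u64				weight;
};

static inline struct foo *css_foo(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct foo, css) : NULL;
}

static struct cgroup_subsys_state *
foo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	return foo ? &foo->css : ERR_PTR(-ENOMEM);
}

static void foo_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_foo(css));
}

static u64 foo_weight_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return css_foo(css)->weight;
}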
*/ -static void freezer_css_offline(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); spin_lock_irq(&freezer->lock); @@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_css_free(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_freezer(cgroup)); + kfree(css_freezer(css)); } /* @@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_subsys_state *new_css, + struct cgroup_taskset *tset) { - struct freezer *freezer = cgroup_freezer(new_cgrp); + struct freezer *freezer = css_freezer(new_css); struct task_struct *task; bool clear_frozen = false; spin_lock_irq(&freezer->lock); /* - * Make the new tasks conform to the current state of @new_cgrp. + * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * - * Tasks in @tset are on @new_cgrp but may not conform to its + * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgrp, tset) { + cgroup_taskset_for_each(task, new_css, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { @@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) * The root cgroup is non-freezable, so we can skip the * following check. */ - if (!freezer->css.cgroup->parent) + if (!parent_freezer(freezer)) goto out; spin_lock_irq(&freezer->lock); @@ -244,7 +240,7 @@ out: /** * update_if_frozen - update whether a cgroup finished freezing - * @cgroup: cgroup of interest + * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, @@ -255,14 +251,14 @@ out: * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being - * migrated into or out of @cgroup, so we can't verify task states against + * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup) +static void update_if_frozen(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); - struct cgroup *pos; - struct cgroup_iter it; + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) goto out_unlock; /* are all (live) children frozen? 
*/ - cgroup_for_each_child(pos, cgroup) { - struct freezer *child = cgroup_freezer(pos); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) @@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) } /* are all tasks frozen? */ - cgroup_iter_start(cgroup, &it); + css_task_iter_start(css, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + while ((task = css_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, cgroup) + css_for_each_descendant_post(pos, css) update_if_frozen(pos); - update_if_frozen(cgroup); rcu_read_unlock(); - seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } static void freeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) freeze_task(task); - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) __thaw_task(task); - cgroup_iter_end(cgroup, &it); + css_task_iter_end(&it); } /** @@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, */ static void freezer_change_state(struct freezer *freezer, bool freeze) { - struct cgroup *pos; - - /* update @freezer */ - spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); - spin_unlock_irq(&freezer->lock); + struct cgroup_subsys_state *pos; /* * Update all its descendants in pre-order traversal. Each @@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * CGROUP_FREEZING_PARENT. */ rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { - struct freezer *pos_f = cgroup_freezer(pos); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - /* - * Our update to @parent->state is already visible which is - * all we need. No need to lock @parent. For more info on - * synchronization, see freezer_post_create(). 
- */ spin_lock_irq(&pos_f->lock); - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); + + if (pos_f == freezer) { + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + } else { + /* + * Our update to @parent->state is already visible + * which is all we need. No need to lock @parent. + * For more info on synchronization, see + * freezer_post_create(). + */ + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + } + spin_unlock_irq(&pos_f->lock); } rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, struct cftype *cft, +static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { bool freeze; @@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), freeze); + freezer_change_state(css_freezer(css), freeze); return 0; } -static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } -static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b87..859c8dfd78a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,32 +15,51 @@ */ #include <linux/context_tracking.h> -#include <linux/kvm_host.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/hardirq.h> #include <linux/export.h> -DEFINE_PER_CPU(struct context_tracking, context_tracking) = { -#ifdef CONFIG_CONTEXT_TRACKING_FORCE - .active = true, -#endif -}; +#define CREATE_TRACE_POINTS +#include <trace/events/context_tracking.h> + +struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(context_tracking_enabled); + +DEFINE_PER_CPU(struct context_tracking, context_tracking); +EXPORT_SYMBOL_GPL(context_tracking); + +void context_tracking_cpu_set(int cpu) +{ + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } +} /** - * user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. + * context_tracking_user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. * * This function must be called right before we switch from the kernel * to userspace, when it's guaranteed the remaining kernel instructions * to execute won't use any RCU read side critical section because this * function sets RCU in extended quiescent state. */ -void user_enter(void) +void context_tracking_user_enter(void) { unsigned long flags; /* + * Repeat the user_enter() check here because some archs may be calling + * this from asm and if no CPU needs context tracking, they shouldn't + * go further. Repeat the check here until they support the static key + * check. 
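/*
 * Call-pattern sketch for the functions above (illustrative, not part of
 * this patch): a C-level syscall path brackets kernel work with
 * user_exit()/user_enter(), assumed here to be the static-key-checking
 * wrappers around context_tracking_user_exit()/_enter(); handle_syscall()
 * is a hypothetical dispatcher.
 */
static long syscall_slowpath(long nr)
{
	long ret;

	user_exit();		/* back in the kernel: RCU usable, vtime accounted */
	ret = handle_syscall(nr);
	user_enter();		/* returning to userspace: enter RCU extended QS */

	return ret;
}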
+ */ + if (!static_key_false(&context_tracking_enabled)) + return; + + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() @@ -55,26 +74,80 @@ void user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if (__this_cpu_read(context_tracking.active) && - __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if (__this_cpu_read(context_tracking.active)) { + trace_user_enter(0); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ + vtime_user_enter(current); + rcu_user_enter(); + } /* - * At this stage, only low level arch entry code remains and - * then we'll run in userspace. We can assume there won't be - * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency - * on the tick. + * Even if context tracking is disabled on this CPU, because it's outside + * the full dynticks mask for example, we still have to keep track of the + * context transitions and states to prevent inconsistency on those of + * other CPUs. + * If a task triggers an exception in userspace, sleep on the exception + * handler and then migrate to another CPU, that new CPU must know where + * the exception returns by the time we call exception_exit(). + * This information can only be provided by the previous CPU when it called + * exception_enter(). + * OTOH we can spare the calls to vtime and RCU when context_tracking.active + * is false because we know that CPU is not tickless. */ - vtime_user_enter(current); - rcu_user_enter(); __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** - * user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. 
* * This function must be called after we entered the kernel from userspace * before any use of RCU read side critical section. This potentially include @@ -83,47 +156,34 @@ void user_enter(void) * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void user_exit(void) +void context_tracking_user_exit(void) { unsigned long flags; + if (!static_key_false(&context_tracking_enabled)) + return; + if (in_interrupt()) return; local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - /* - * We are going to run code that may use RCU. Inform - * RCU core about that (ie: we may need the tick again). - */ - rcu_user_exit(); - vtime_user_exit(current); + if (__this_cpu_read(context_tracking.active)) { + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ + rcu_user_exit(); + vtime_user_exit(current); + trace_user_exit(0); + } __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); } -void guest_enter(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_enter(current); - else - __guest_enter(); -} -EXPORT_SYMBOL_GPL(guest_enter); - -void guest_exit(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_exit(current); - else - __guest_exit(); -} -EXPORT_SYMBOL_GPL(guest_exit); - - /** - * context_tracking_task_switch - context switch the syscall callbacks + * __context_tracking_task_switch - context switch the syscall callbacks * @prev: the task that is being switched out * @next: the task that is being switched in * @@ -135,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit); * migrate to some CPU that doesn't do the context tracking. As such the TIF * flag may not be desired there. */ -void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { - if (__this_cpu_read(context_tracking.active)) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); +} + +#ifdef CONFIG_CONTEXT_TRACKING_FORCE +void __init context_tracking_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + context_tracking_cpu_set(cpu); } +#endif diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427..d7f07a2da5a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); * get_online_cpus() not an api which is called all that often. * */ -static void cpu_hotplug_begin(void) +void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; @@ -127,16 +127,34 @@ static void cpu_hotplug_begin(void) } } -static void cpu_hotplug_done(void) +void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); } -#else /* #if CONFIG_HOTPLUG_CPU */ -static void cpu_hotplug_begin(void) {} -static void cpu_hotplug_done(void) {} -#endif /* #else #if CONFIG_HOTPLUG_CPU */ +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. 
+ */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + +#endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) @@ -345,7 +363,7 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen) { int ret, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -398,7 +416,7 @@ out: return ret; } -int __cpuinit cpu_up(unsigned int cpu) +int cpu_up(unsigned int cpu) { int err = 0; @@ -541,36 +559,6 @@ static int __init alloc_frozen_cpus(void) core_initcall(alloc_frozen_cpus); /* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - -/* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen * or not, as reported by the notification, remains unchanged *throughout the @@ -589,12 +577,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: @@ -627,7 +615,7 @@ core_initcall(cpu_hotplug_pm_sync_init); * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -void __cpuinit notify_cpu_starting(unsigned int cpu) +void notify_cpu_starting(unsigned int cpu) { unsigned long val = CPU_STARTING; diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038..e695c0a0bcb 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include <linux/cpu.h> #include <linux/tick.h> #include <linux/mm.h> +#include <linux/stackprotector.h> #include <asm/tlb.h> @@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* @@ -112,6 +114,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). 
Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b3f791bbe..6bf981e13c4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -59,6 +59,7 @@ #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/wait.h> /* * Tracks how many cpusets are currently defined in system. @@ -67,10 +68,6 @@ */ int number_of_cpusets __read_mostly; -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; -struct cpuset; - /* See "Frequency meter" comments, below. */ struct fmeter { @@ -87,6 +84,18 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. + * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + struct fmeter fmeter; /* memory_pressure filter */ /* @@ -100,31 +109,22 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - - struct work_struct hotplug_work; }; -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cont) +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), - struct cpuset, css); + return css ? container_of(css, struct cpuset, css) : NULL; } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_subsys_state(task, cpuset_subsys_id), - struct cpuset, css); + return css_cs(task_css(task, cpuset_subsys_id)); } -static inline struct cpuset *parent_cs(const struct cpuset *cs) +static inline struct cpuset *parent_cs(struct cpuset *cs) { - struct cgroup *pcgrp = cs->css.cgroup->parent; - - if (pcgrp) - return cgroup_cs(pcgrp); - return NULL; + return css_cs(css_parent(&cs->css)); } #ifdef CONFIG_NUMA @@ -201,29 +201,30 @@ static struct cpuset top_cpuset = { /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. 
*/ -#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ - cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ - if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_cgrp by calling - * cgroup_rightmost_descendant() to skip subtree. + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ - cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ - if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global mutexes guarding cpuset structures - cpuset_mutex @@ -267,14 +268,11 @@ static DEFINE_MUTEX(callback_mutex); /* * CPU / memory hotplug is handled asynchronously. */ -static struct workqueue_struct *cpuset_propagate_hotplug_wq; - static void cpuset_hotplug_workfn(struct work_struct *work); -static void cpuset_propagate_hotplug_workfn(struct work_struct *work); -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); - static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); +static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); + /* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we @@ -304,53 +302,37 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. If we get - * all the way to the top and still haven't found any online cpus, - * return cpu_online_mask. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_mask. + * until we find one that does have some online cpus. The top + * cpuset always has some cpus online. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * * Call with callback_mutex held. */ - -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) +static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); - if (cs) - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); - else - cpumask_copy(pmask, cpu_online_mask); - BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); } /* * Return in *pmask the portion of a cpusets's mems_allowed that * are online, with memory. If none are online with memory, walk * up the cpuset hierarchy until we find one that does have some - * online mems. 
If we get all the way to the top and still haven't - * found any online mems, return node_states[N_MEMORY]. + * online mems. The top cpuset always has some mems online. * * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * * Call with callback_mutex held. */ - -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (cs && !nodes_intersects(cs->mems_allowed, - node_states[N_MEMORY])) + while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); - if (cs) - nodes_and(*pmask, cs->mems_allowed, - node_states[N_MEMORY]); - else - *pmask = node_states[N_MEMORY]; - BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); + nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); } /* @@ -391,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; @@ -438,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) * Return 0 if valid, -errno if not. */ -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +static int validate_change(struct cpuset *cur, struct cpuset *trial) { - struct cgroup *cont; + struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; @@ -448,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cont, cur) + cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -469,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cont, par) { + cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -482,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* * Cpusets with tasks - existing or newly being attached - can't - * have empty cpus_allowed or mems_allowed. + * be changed to have empty cpus_allowed or mems_allowed. 
*/ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && - (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed))) - goto out; + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } ret = 0; out: @@ -518,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) + continue; + /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } @@ -540,7 +529,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * This function builds a partial partition of the systems CPUs * A 'partial partition' is a set of non-overlapping subsets whose * union is a subset of that set. - * The output of this function needs to be passed to kernel/sched.c + * The output of this function needs to be passed to kernel/sched/core.c * partition_sched_domains() routine, which will rebuild the scheduler's * load balancing domains (sched domains) as specified by that partial * partition. @@ -569,7 +558,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, * is a subset of one of these domains, while there are as * many such domains as possible, each as small as possible. * doms - Conversion of 'csa' to an array of cpumasks, for passing to - * the kernel/sched.c routine partition_sched_domains() in a + * the kernel/sched/core.c routine partition_sched_domains() in a * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) @@ -599,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; doms = NULL; dattr = NULL; @@ -628,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -645,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree */ - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -798,62 +789,121 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/** - * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's - * @tsk: task to test - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner +/* + * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus + * @cs: the cpuset in interest * - * Call with cpuset_mutex held. 
May take callback_mutex during call. - * Called for each task in a cgroup by cgroup_scan_tasks(). - * Return nonzero if this tasks's cpus_allowed mask should be changed (in other - * words, if its mask is not equal to its cpuset's mask). + * A cpuset's effective cpumask is the cpumask of the nearest ancestor + * with non-empty cpus. We use effective cpumask whenever: + * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask + * if the cpuset they reside in has no cpus) + * - we want to retrieve task_cs(tsk)'s cpus_allowed. + * + * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an + * exception. See comments there. */ -static int cpuset_test_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) { - return !cpumask_equal(&tsk->cpus_allowed, - (cgroup_cs(scan->cg))->cpus_allowed); + while (cpumask_empty(cs->cpus_allowed)) + cs = parent_cs(cs); + return cs; +} + +/* + * effective_nodemask_cpuset - return nearest ancestor with non-empty mems + * @cs: the cpuset in interest + * + * A cpuset's effective nodemask is the nodemask of the nearest ancestor + * with non-empty memss. We use effective nodemask whenever: + * - we update tasks' mems_allowed. (they take on the ancestor's nodemask + * if the cpuset they reside in has no mems) + * - we want to retrieve task_cs(tsk)'s mems_allowed. + * + * Called with cpuset_mutex held. + */ +static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) +{ + while (nodes_empty(cs->mems_allowed)) + cs = parent_cs(cs); + return cs; } /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. + * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed + * mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, void *data) { - set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); + struct cpuset *cs = data; + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + + set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. 
*/ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; + css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); +} - scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); +/* + * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. + * @root_cs: the root cpuset of the hierarchy + * @update_root: update root cpuset or not? + * @heap: the heap used by css_scan_tasks() + * + * This will update cpumasks of tasks in @root_cs and all other empty cpusets + * which take on cpumask of @root_cs. + * + * Called with cpuset_mutex held + */ +static void update_tasks_cpumask_hier(struct cpuset *root_cs, + bool update_root, struct ptr_heap *heap) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + } + if (!css_tryget(&cp->css)) + continue; + rcu_read_unlock(); + + update_tasks_cpumask(cp, heap); + + rcu_read_lock(); + css_put(&cp->css); + } + rcu_read_unlock(); } /** @@ -888,14 +938,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; /* Nothing to do if the cpus didn't change */ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; @@ -906,11 +957,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - /* - * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. - */ - update_tasks_cpumask(cs, &heap); + update_tasks_cpumask_hier(cs, true, &heap); heap_free(&heap); @@ -943,12 +990,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; + struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); + mems_cs = effective_nodemask_cpuset(task_cs(tsk)); + guarantee_online_mems(mems_cs, &tsk->mems_allowed); } /* @@ -999,24 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } +struct cpuset_change_nodemask_arg { + struct cpuset *cs; + nodemask_t *newmems; +}; + /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if * memory_migrate flag is set. Called with cpuset_mutex held. 
*/ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) +static void cpuset_change_nodemask(struct task_struct *p, void *data) { + struct cpuset_change_nodemask_arg *arg = data; + struct cpuset *cs = arg->cs; struct mm_struct *mm; - struct cpuset *cs; int migrate; - const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cpuset_mutex */ - - cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); - cpuset_change_task_nodemask(p, &newmems); + cpuset_change_task_nodemask(p, arg->newmems); mm = get_task_mm(p); if (!mm) @@ -1026,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); mmput(mm); } @@ -1035,25 +1084,21 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @oldmem: old mems_allowed of cpuset cs - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. + * Called with cpuset_mutex held. No return value. It's guaranteed that + * css_scan_tasks() always returns 0 if @heap != NULL. */ -static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, - struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; + static nodemask_t newmems; /* protected by cpuset_mutex */ + struct cpuset *mems_cs = effective_nodemask_cpuset(cs); + struct cpuset_change_nodemask_arg arg = { .cs = cs, + .newmems = &newmems }; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = (nodemask_t *)oldmem; + guarantee_online_mems(mems_cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1065,13 +1110,60 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(&scan); + css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + + /* + * All the tasks' nodemasks have been updated, update + * cs->old_mems_allowed. + */ + cs->old_mems_allowed = newmems; /* We're done rebinding vmas to this cpuset's new mems_allowed. */ cpuset_being_rebound = NULL; } /* + * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. + * @cs: the root cpuset of the hierarchy + * @update_root: update the root cpuset or not? + * @heap: the heap used by css_scan_tasks() + * + * This will update nodemasks of tasks in @root_cs and all other empty cpusets + * which take on nodemask of @root_cs. 
+ * + * Called with cpuset_mutex held + */ +static void update_tasks_nodemask_hier(struct cpuset *root_cs, + bool update_root, struct ptr_heap *heap) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!nodes_empty(cp->mems_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + } + if (!css_tryget(&cp->css)) + continue; + rcu_read_unlock(); + + update_tasks_nodemask(cp, heap); + + rcu_read_lock(); + css_put(&cp->css); + } + rcu_read_unlock(); +} + +/* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed, and for each task in the cpuset, @@ -1087,13 +1179,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; - if (!oldmem) - return -ENOMEM; - /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; * it's read-only @@ -1122,8 +1210,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; } } - *oldmem = cs->mems_allowed; - if (nodes_equal(*oldmem, trialcs->mems_allowed)) { + + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1139,11 +1227,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, oldmem, &heap); + update_tasks_nodemask_hier(cs, true, &heap); heap_free(&heap); done: - NODEMASK_FREE(oldmem); return retval; } @@ -1169,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/* +/** * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup. + * Called by css_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_flag(struct task_struct *tsk, void *data) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + struct cpuset *cs = data; + + cpuset_update_task_spread_flag(cs, tsk); } -/* +/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. 
*/ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cg = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); + css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); } /* @@ -1364,19 +1446,25 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct task_struct *task; int ret; mutex_lock(&cpuset_mutex); + /* + * We allow to move tasks into an empty cpuset if sane_behavior + * flag is set. + */ ret = -ENOSPC; - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (!cgroup_sane_behavior(css->cgroup) && + (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1405,11 +1493,11 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup *cgrp, +static void cpuset_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mutex_lock(&cpuset_mutex); - cgroup_cs(cgrp)->attach_in_progress--; + css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); } @@ -1420,17 +1508,20 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - /* static bufs protected by cpuset_mutex */ - static nodemask_t cpuset_attach_nodemask_from; + /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); - struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, + cpuset_subsys_id); + struct cpuset *cs = css_cs(css); + struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1438,11 +1529,11 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cs, cpus_attach); + guarantee_online_cpus(cpus_cs, cpus_attach); - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1457,26 +1548,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. 
*/ - cpuset_attach_nodemask_from = oldcs->mems_allowed; cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(leader); if (mm) { + struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); - if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + + /* + * old_mems_allowed is the same with mems_allowed here, except + * if this task is being moved automatically due to hotplug. + * In that case @mems_allowed has been updated and is empty, + * so @old_mems_allowed is the right nodesets that we migrate + * mm from. + */ + if (is_memory_migrate(cs)) { + cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); + } mmput(mm); } - cs->attach_in_progress--; + cs->old_mems_allowed = cpuset_attach_nodemask_to; - /* - * We may have raced with CPU/memory hotunplug. Trigger hotplug - * propagation if @cs doesn't have any CPU or memory. It will move - * the newly added tasks to the nearest parent which can execute. - */ - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - schedule_cpuset_propagate_hotplug(cs); + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); } @@ -1498,15 +1595,18 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; - int retval = -ENODEV; + int retval = 0; mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) + if (!is_cpuset_online(cs)) { + retval = -ENODEV; goto out_unlock; + } switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1545,9 +1645,10 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1571,10 +1672,10 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cpuset_write_resmask(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *trialcs; int retval = -ENODEV; @@ -1588,13 +1689,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. - * - * Flushing cpuset_hotplug_work is enough to synchronize against - * hotplug hanlding; however, cpuset_attach() may schedule - * propagation work directly. Flush the workqueue too. 
*/ flush_work(&cpuset_hotplug_work); - flush_workqueue(cpuset_propagate_hotplug_wq); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) @@ -1658,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1694,9 +1789,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1725,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1839,14 +1934,15 @@ static struct cftype files[] = { /* * cpuset_css_alloc - allocate a cpuset css - * cont: control group that the new cpuset will be part of + * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; - if (!cont->parent) + if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1861,18 +1957,17 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); - INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; return &cs->css; } -static int cpuset_css_online(struct cgroup *cgrp) +static int cpuset_css_online(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cg; + struct cgroup_subsys_state *pos_css; if (!parent) return 0; @@ -1887,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* @@ -1904,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cg, parent) { + cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; @@ -1921,9 +2016,15 @@ out_unlock: return 0; } -static void cpuset_css_offline(struct cgroup *cgrp) +/* + * If the cpuset being removed has its flag 'sched_load_balance' + * enabled, then simulate turning sched_load_balance off, which + * will call rebuild_sched_domains_locked(). 
+ */ + +static void cpuset_css_offline(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1936,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). - */ - -static void cpuset_css_free(struct cgroup *cont) +static void cpuset_css_free(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cont); + struct cpuset *cs = css_cs(css); free_cpumask_var(cs->cpus_allowed); kfree(cs); @@ -2024,41 +2119,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /** - * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. */ -static void cpuset_propagate_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug_update_tasks(struct cpuset *cs) { static cpumask_t off_cpus; - static nodemask_t off_mems, tmp_mems; - struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + static nodemask_t off_mems; bool is_empty; + bool sane = cgroup_sane_behavior(cs->css.cgroup); + +retry: + wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); mutex_lock(&cpuset_mutex); + /* + * We have raced with task attaching. We wait until attaching + * is finished, so we won't attach a task to an empty cpuset. + */ + if (cs->attach_in_progress) { + mutex_unlock(&cpuset_mutex); + goto retry; + } + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - /* remove offline cpus from @cs */ - if (!cpumask_empty(&off_cpus)) { - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + + /* + * If sane_behavior flag is set, we need to update tasks' cpumask + * for empty cpuset to take on ancestor's cpumask. Otherwise, don't + * call update_tasks_cpumask() if the cpuset becomes empty, as + * the tasks in it will be migrated to an ancestor. + */ + if ((sane && cpumask_empty(cs->cpus_allowed)) || + (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) update_tasks_cpumask(cs, NULL); - } - /* remove offline mems from @cs */ - if (!nodes_empty(off_mems)) { - tmp_mems = cs->mems_allowed; - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &tmp_mems, NULL); - } + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + + /* + * If sane_behavior flag is set, we need to update tasks' nodemask + * for empty cpuset to take on ancestor's nodemask. Otherwise, don't + * call update_tasks_nodemask() if the cpuset becomes empty, as + * the tasks in it will be migratd to an ancestor. 
+ */ + if ((sane && nodes_empty(cs->mems_allowed)) || + (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) + update_tasks_nodemask(cs, NULL); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2066,40 +2184,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) mutex_unlock(&cpuset_mutex); /* - * If @cs became empty, move tasks to the nearest ancestor with - * execution resources. This is full cgroup operation which will + * If sane_behavior flag is set, we'll keep tasks in empty cpusets. + * + * Otherwise move tasks to the nearest ancestor with execution + * resources. This is full cgroup operation which will * also call back into cpuset. Should be done outside any lock. */ - if (is_empty) + if (!sane && is_empty) remove_tasks_in_empty_cpuset(cs); - - /* the following may free @cs, should be the last operation */ - css_put(&cs->css); -} - -/** - * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset - * @cs: cpuset of interest - * - * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and - * memory masks according to top_cpuset. - */ -static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) -{ - /* - * Pin @cs. The refcnt will be released when the work item - * finishes executing. - */ - if (!css_tryget(&cs->css)) - return; - - /* - * Queue @cs->hotplug_work. If already pending, lose the css ref. - * cpuset_propagate_hotplug_wq is ordered and propagation will - * happen in the order this function is called. - */ - if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) - css_put(&cs->css); } /** @@ -2112,18 +2204,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) * actively using CPU hotplug but making no active use of cpusets. * * Non-root cpusets are only affected by offlining. If any CPUs or memory - * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all - * descendants. + * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on + * all descendants. * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. 
*/ static void cpuset_hotplug_workfn(struct work_struct *work) { - static cpumask_t new_cpus, tmp_cpus; - static nodemask_t new_mems, tmp_mems; + static cpumask_t new_cpus; + static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool cpus_offlined, mems_offlined; mutex_lock(&cpuset_mutex); @@ -2132,12 +2223,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) new_mems = node_states[N_MEMORY]; cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, - &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); - nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); - mems_offlined = !nodes_empty(tmp_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { @@ -2149,28 +2235,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - tmp_mems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + update_tasks_nodemask(&top_cpuset, NULL); } - /* if cpus or mems went down, we need to propagate to descendants */ - if (cpus_offlined || mems_offlined) { + mutex_unlock(&cpuset_mutex); + + /* if cpus or mems changed, we need to propagate to descendants */ + if (cpus_updated || mems_updated) { struct cpuset *cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) - schedule_cpuset_propagate_hotplug(cs); - rcu_read_unlock(); - } + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (cs == &top_cpuset || !css_tryget(&cs->css)) + continue; + rcu_read_unlock(); - mutex_unlock(&cpuset_mutex); + cpuset_hotplug_update_tasks(cs); - /* wait for propagations to finish */ - flush_workqueue(cpuset_propagate_hotplug_wq); + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); + } /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated) @@ -2219,12 +2309,9 @@ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; + top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - - cpuset_propagate_hotplug_wq = - alloc_ordered_workqueue("cpuset_hotplug", 0); - BUG_ON(!cpuset_propagate_hotplug_wq); } /** @@ -2240,21 +2327,23 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { + struct cpuset *cpus_cs; + mutex_lock(&callback_mutex); task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), pmask); + cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); + guarantee_online_cpus(cpus_cs, pmask); task_unlock(tsk); mutex_unlock(&callback_mutex); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cs; + struct cpuset *cpus_cs; rcu_read_lock(); - cs = task_cs(tsk); - if (cs) - do_set_cpus_allowed(tsk, cs->cpus_allowed); + cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); + do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); rcu_read_unlock(); /* @@ -2293,11 +2382,13 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { + struct cpuset *mems_cs; nodemask_t mask; mutex_lock(&callback_mutex); task_lock(tsk); - guarantee_online_mems(task_cs(tsk), &mask); + mems_cs = 
effective_nodemask_cpuset(task_cs(tsk)); + guarantee_online_mems(mems_cs, &mask); task_unlock(tsk); mutex_unlock(&callback_mutex); @@ -2321,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) +static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); @@ -2391,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) */ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - const struct cpuset *cs; /* current cpuset ancestors */ + struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) @@ -2629,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8..97b67df8fbf 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -116,6 +116,9 @@ int get_callchain_buffers(void) err = alloc_callchain_buffers(); exit: + if (err) + atomic_dec(&nr_callchain_events); + mutex_unlock(&callchain_mutex); return err; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c..953c1434837 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static atomic_t nr_freq_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -165,10 +166,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' /* * max perf event sample rate */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +#define DEFAULT_MAX_SAMPLE_RATE 100000 +#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) +#define DEFAULT_CPU_TIME_MAX_PERCENT 25 + +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; + +static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); +static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; + +static atomic_t perf_sample_allowed_ns __read_mostly = + ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); + +void update_perf_cpu_limits(void) +{ + u64 tmp = perf_sample_period_ns; + + tmp *= sysctl_perf_cpu_time_max_percent; + do_div(tmp, 100); + atomic_set(&perf_sample_allowed_ns, tmp); +} + +static int perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -180,10 +199,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + perf_sample_period_ns = NSEC_PER_SEC / 
sysctl_perf_event_sample_rate; + update_perf_cpu_limits(); return 0; } +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; + +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + update_perf_cpu_limits(); + + return 0; +} + +/* + * perf samples are done in some very critical code paths (NMIs). + * If they take too much CPU time, the system can lock up and not + * get any real work done. This will drop the sample rate when + * we detect that events are taking too long. + */ +#define NR_ACCUMULATED_SAMPLES 128 +DEFINE_PER_CPU(u64, running_sample_length); + +void perf_sample_event_took(u64 sample_len_ns) +{ + u64 avg_local_sample_len; + u64 local_samples_len; + + if (atomic_read(&perf_sample_allowed_ns) == 0) + return; + + /* decay the counter by 1 average sample */ + local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; + local_samples_len += sample_len_ns; + __get_cpu_var(running_sample_length) = local_samples_len; + + /* + * note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * from having to maintain a count. + */ + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + return; + + if (max_samples_per_tick <= 1) + return; + + max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); + sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + + printk_ratelimited(KERN_WARNING + "perf samples too long (%lld > %d), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, + atomic_read(&perf_sample_allowed_ns), + sysctl_perf_event_sample_rate); + + update_perf_cpu_limits(); +} + static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, @@ -196,9 +283,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -257,8 +341,8 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); + return container_of(task_css(task, perf_subsys_id), + struct perf_cgroup, css); } static inline bool @@ -508,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = cgroup_css_from_dir(f.file, perf_subsys_id); + rcu_read_lock(); + + css = css_from_dir(f.file->f_dentry, &perf_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -534,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: + rcu_read_unlock(); fdput(f); return ret; } @@ -658,6 +745,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, } #endif +/* + * set default to be dependent on timer tick just + * like original code + */ +#define PERF_CPU_HRTIMER (1000 / HZ) +/* + * function must be called with interrupts disbled + */ 
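The throttling logic introduced above decays a per-CPU running total by one average sample per event, then compares the resulting average against a fraction (sysctl_perf_cpu_time_max_percent) of the current sample period, halving the sample rate whenever the budget is exceeded. The stand-alone C sketch below replays that arithmetic in user space to show how quickly the rate collapses once samples consistently overrun the budget; the HZ value, the 25% limit and the 50 us per-sample cost are made-up inputs for illustration, not values taken from the patch.

#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128
#define HZ                     250   /* assumed tick rate */
#define CPU_TIME_MAX_PERCENT   25    /* assumed sysctl value */

int main(void)
{
	int max_samples_per_tick = 100000 / HZ;   /* DEFAULT_MAX_SAMPLE_RATE / HZ */
	uint64_t period_ns  = 1000000000ULL / (uint64_t)(max_samples_per_tick * HZ);
	uint64_t allowed_ns = period_ns * CPU_TIME_MAX_PERCENT / 100;
	uint64_t running_len = 0;                 /* mirrors running_sample_length */
	uint64_t sample_cost = 50000;             /* pretend every sample costs 50 us */

	for (int i = 0; i < 2000 && max_samples_per_tick > 1; i++) {
		/* decay the counter by one average sample, then add the new one */
		running_len -= running_len / NR_ACCUMULATED_SAMPLES;
		running_len += sample_cost;

		uint64_t avg = running_len / NR_ACCUMULATED_SAMPLES;
		if (avg <= allowed_ns)
			continue;

		/* same reaction as the kernel path: halve the rate, recompute the budget */
		max_samples_per_tick = (max_samples_per_tick + 1) / 2;  /* DIV_ROUND_UP(x, 2) */
		period_ns  = 1000000000ULL / (uint64_t)(max_samples_per_tick * HZ);
		allowed_ns = period_ns * CPU_TIME_MAX_PERCENT / 100;
		printf("sample %4d: avg %llu ns over budget, rate now %d samples/tick (%llu ns allowed)\n",
		       i, (unsigned long long)avg, max_samples_per_tick,
		       (unsigned long long)allowed_ns);
	}
	return 0;
}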
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +{ + struct perf_cpu_context *cpuctx; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int rotations = 0; + + WARN_ON(!irqs_disabled()); + + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); + + rotations = perf_rotate_context(cpuctx); + + /* + * arm timer if needed + */ + if (rotations) { + hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + ret = HRTIMER_RESTART; + } + + return ret; +} + +/* CPU is going down */ +void perf_cpu_hrtimer_cancel(int cpu) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (WARN_ON(cpu != smp_processor_id())) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (pmu->task_ctx_nr == perf_sw_context) + continue; + + hrtimer_cancel(&cpuctx->hrtimer); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + int timer; + + /* no multiplexing needed for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + /* + * check default is sane, if not set then force to + * default interval (1/tick) + */ + timer = pmu->hrtimer_interval_ms; + if (timer < 1) + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hr->function = perf_cpu_hrtimer_handler; +} + +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +{ + struct hrtimer *hr = &cpuctx->hrtimer; + struct pmu *pmu = cpuctx->ctx.pmu; + + /* not for SW PMU */ + if (pmu->task_ctx_nr == perf_sw_context) + return; + + if (hrtimer_active(hr)) + return; + + if (!hrtimer_callback_running(hr)) + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, + 0, HRTIMER_MODE_REL_PINNED, 0); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -686,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) { - int was_empty = list_empty(head); + if (list_empty(&cpuctx->rotation_list)) list_add(&cpuctx->rotation_list, head); - if (was_empty) - tick_nohz_full_kick(); - } } static void get_ctx(struct perf_event_context *ctx) @@ -764,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; - rcu_read_lock(); retry: + /* + * One of the few rules of preemptible RCU is that one cannot do + * rcu_read_unlock() while holding a scheduler (or nested) lock when + * part of the read side critical section was preemptible -- see + * rcu_read_unlock_special(). + * + * Since ctx->lock nests under rq->lock we must ensure the entire read + * side critical section is non-preemptible. 
+ */ + preempt_disable(); + rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* @@ -781,6 +974,8 @@ retry: raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); + rcu_read_unlock(); + preempt_enable(); goto retry; } @@ -790,6 +985,7 @@ retry: } } rcu_read_unlock(); + preempt_enable(); return ctx; } @@ -1020,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + size += sizeof(data->id); + if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); @@ -1506,6 +1705,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1552,6 +1752,8 @@ group_error: pmu->cancel_txn(pmu); + perf_cpu_hrtimer_restart(cpuctx); + return -EAGAIN; } @@ -1764,7 +1966,16 @@ static int __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - if (WARN_ON_ONCE(!ctx->is_active)) + /* + * There's a time window between 'ctx->is_active' check + * in perf_event_enable function and this place having: + * - IRQs on + * - ctx->lock unlocked + * + * where the task could be killed and 'ctx' deactivated + * by perf_event_exit_task. + */ + if (!ctx->is_active) return -EINVAL; raw_spin_lock(&ctx->lock); @@ -1807,8 +2018,10 @@ static int __perf_event_enable(void *info) * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != event) + if (leader != event) { group_sched_out(leader, cpuctx, ctx); + perf_cpu_hrtimer_restart(cpuctx); + } if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_EVENT_STATE_ERROR; @@ -2502,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, hwc = &event->hw; - if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); @@ -2555,7 +2768,7 @@ static void rotate_ctx(struct perf_event_context *ctx) * because they're strictly cpu affine and rotate_start is called with IRQs * disabled, while rotate_context is called from IRQ context. 
*/ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) +static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0, remove = 1; @@ -2594,15 +2807,18 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) done: if (remove) list_del_init(&cpuctx->rotation_list); + + return rotate; } #ifdef CONFIG_NO_HZ_FULL bool perf_event_can_stop_tick(void) { - if (list_empty(&__get_cpu_var(rotation_list))) - return true; - else + if (atomic_read(&nr_freq_events) || + __this_cpu_read(perf_throttled_count)) return false; + else + return true; } #endif @@ -2625,10 +2841,6 @@ void perf_event_task_tick(void) ctx = cpuctx->task_ctx; if (ctx) perf_adjust_freq_unthr_context(ctx, throttled); - - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); } } @@ -2918,43 +3130,50 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); -static void free_event(struct perf_event *event) +static void unaccount_event_cpu(struct perf_event *event, int cpu) { - irq_work_sync(&event->pending); - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_dec(&nr_mmap_events); - if (event->attr.comm) - atomic_dec(&nr_comm_events); - if (event->attr.task) - atomic_dec(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - if (is_cgroup_event(event)) { - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - static_key_slow_dec_deferred(&perf_sched_events); - } + if (event->parent) + return; - if (has_branch_stack(event)) { - static_key_slow_dec_deferred(&perf_sched_events); - /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, - event->cpu)); - } + if (has_branch_stack(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); } + if (is_cgroup_event(event)) + atomic_dec(&per_cpu(perf_cgroup_events, cpu)); +} - if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; - } +static void unaccount_event(struct perf_event *event) +{ + if (event->parent) + return; + if (event->attach_state & PERF_ATTACH_TASK) + static_key_slow_dec_deferred(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + if (event->attr.freq) + atomic_dec(&nr_freq_events); if (is_cgroup_event(event)) - perf_detach_cgroup(event); + static_key_slow_dec_deferred(&perf_sched_events); + if (has_branch_stack(event)) + static_key_slow_dec_deferred(&perf_sched_events); + + unaccount_event_cpu(event, event->cpu); +} + +static void __free_event(struct perf_event *event) +{ + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } if (event->destroy) event->destroy(event); @@ -2964,6 +3183,37 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending); + + unaccount_event(event); + + if (event->rb) { + struct ring_buffer *rb; + + /* + * Can happen when we close an event with 
re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); + } + + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + + + __free_event(event); +} int perf_event_release_kernel(struct perf_event *event) { @@ -3188,30 +3438,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3337,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); + case PERF_EVENT_IOC_ID: + { + u64 id = primary_event_id(event); + + if (copy_to_user((void __user *)arg, &id, sizeof(id))) + return -EFAULT; + return 0; + } + case PERF_EVENT_IOC_SET_OUTPUT: { int ret; @@ -3418,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3434,6 +3696,10 @@ void perf_event_update_userpage(struct perf_event *event) u64 enabled, running, now; rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event @@ -3444,12 +3710,8 @@ void perf_event_update_userpage(struct perf_event *event) * NMI context */ calc_timer_values(event, &now, &enabled, &running); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; userpg = rb->user_page; - /* * Disable preemption so as to not let the corresponding user-space * spin too long if we get preempted. 
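The new PERF_EVENT_IOC_ID ioctl above copies the primary event ID back to user space, which is what lets a reader match PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER values found in the ring buffer to a particular file descriptor. A rough usage sketch follows; it assumes a uapi linux/perf_event.h that already carries this series, and the task-clock software event plus the minimal error handling are purely illustrative.

/* Open a counting software event on the current task and ask the kernel for
 * its event ID via the PERF_EVENT_IOC_ID ioctl added above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t id;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.sample_type = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP;

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	if (ioctl(fd, PERF_EVENT_IOC_ID, &id) < 0) {
		perror("PERF_EVENT_IOC_ID");
		return 1;
	}

	printf("event id: %llu\n", (unsigned long long)id);
	close(fd);
	return 0;
}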
@@ -3521,16 +3783,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3549,13 +3807,10 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + } rcu_read_unlock(); } @@ -3584,18 +3839,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3605,26 +3852,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); + + atomic_dec(&rb->mmap_count); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; + + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } + + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. 
+ */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; + } + rcu_read_unlock(); + + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } mutex_unlock(&event->mmap_mutex); + put_event(event); - ring_buffer_put(rb); - free_uid(user); + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3674,12 +3995,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3720,13 +4053,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + atomic_set(&rb->mmap_count, 1); + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + ring_buffer_attach(event, rb); + rcu_assign_pointer(event->rb, rb); + + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: @@ -3734,7 +4072,11 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. 
+ */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; @@ -3965,7 +4307,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_TIME) data->time = perf_clock(); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) @@ -4004,6 +4346,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, @@ -4069,7 +4414,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; - if (sub != event) + if ((sub != event) && + (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub); @@ -4116,6 +4462,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, *header); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); + if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); @@ -4176,20 +4525,6 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (!event->attr.watermark) { - int wakeup_events = event->attr.wakeup_events; - - if (wakeup_events) { - struct ring_buffer *rb = handle->rb; - int events = local_inc_return(&rb->events); - - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; @@ -4225,16 +4560,31 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (sample_type & PERF_SAMPLE_STACK_USER) + if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + } if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -4394,12 +4744,10 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); static void perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_match_cb match, perf_event_aux_output_cb output, void *data) { @@ -4410,15 +4758,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, continue; if (!event_filter_match(event)) continue; - if (match(event, data)) - output(event, data); + output(event, data); } } static void -perf_event_aux(perf_event_aux_match_cb match, - perf_event_aux_output_cb output, - void *data, +perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { struct perf_cpu_context *cpuctx; @@ -4431,7 +4776,7 @@ perf_event_aux(perf_event_aux_match_cb match, cpuctx = 
get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data); if (task_ctx) goto next; ctxn = pmu->task_ctx_nr; @@ -4439,14 +4784,14 @@ perf_event_aux(perf_event_aux_match_cb match, goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, match, output, data); + perf_event_aux_ctx(ctx, output, data); next: put_cpu_ptr(pmu->pmu_cpu_context); } if (task_ctx) { preempt_disable(); - perf_event_aux_ctx(task_ctx, match, output, data); + perf_event_aux_ctx(task_ctx, output, data); preempt_enable(); } rcu_read_unlock(); @@ -4455,7 +4800,7 @@ next: /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { @@ -4473,6 +4818,13 @@ struct perf_task_event { } event_id; }; +static int perf_event_task_match(struct perf_event *event) +{ + return event->attr.comm || event->attr.mmap || + event->attr.mmap2 || event->attr.mmap_data || + event->attr.task; +} + static void perf_event_task_output(struct perf_event *event, void *data) { @@ -4482,6 +4834,9 @@ static void perf_event_task_output(struct perf_event *event, struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; + if (!perf_event_task_match(event)) + return; + perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, @@ -4504,13 +4859,6 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event, - void *data __maybe_unused) -{ - return event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task; -} - static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) @@ -4539,8 +4887,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_match, - perf_event_task_output, + perf_event_aux(perf_event_task_output, &task_event, task_ctx); } @@ -4567,6 +4914,11 @@ struct perf_comm_event { } event_id; }; +static int perf_event_comm_match(struct perf_event *event) +{ + return event->attr.comm; +} + static void perf_event_comm_output(struct perf_event *event, void *data) { @@ -4576,6 +4928,9 @@ static void perf_event_comm_output(struct perf_event *event, int size = comm_event->event_id.header.size; int ret; + if (!perf_event_comm_match(event)) + return; + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, comm_event->event_id.header.size); @@ -4597,12 +4952,6 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event, - void *data __maybe_unused) -{ - return event->attr.comm; -} - static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; @@ -4617,8 +4966,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_match, - perf_event_comm_output, + perf_event_aux(perf_event_comm_output, comm_event, NULL); } @@ -4669,6 +5017,9 @@ struct perf_mmap_event { const char *file_name; int file_size; + int maj, min; + u64 ino; + u64 ino_generation; struct { struct perf_event_header header; @@ -4681,6 +5032,17 @@ struct perf_mmap_event { } event_id; }; 
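With attr.mmap2 set, perf_event_mmap_output() below switches the header type to PERF_RECORD_MMAP2 and appends the new device/inode fields between the classic MMAP body and the file name (a later hunk in this series still rejects attr->mmap2 at open time, marked "disabled for now"). A reader-side sketch of the record as this patch emits it is given here; the field names mirror the kernel structure, later kernels extend the record further, and the usual sample_id trailer still follows when attr.sample_id_all is set.

/* On-ring-buffer layout of PERF_RECORD_MMAP2 as emitted by this patch:
 * classic MMAP body, then maj/min/ino/ino_generation, then the file name. */
#include <stdio.h>
#include <stdint.h>

struct perf_record_header {          /* same layout as struct perf_event_header */
	uint32_t type;               /* PERF_RECORD_MMAP2 */
	uint16_t misc;
	uint16_t size;
};

struct perf_record_mmap2 {
	struct perf_record_header header;
	uint32_t pid, tid;
	uint64_t addr;               /* vma->vm_start */
	uint64_t len;                /* vm_end - vm_start */
	uint64_t pgoff;              /* vm_pgoff << PAGE_SHIFT */
	uint32_t maj, min;           /* MAJOR()/MINOR() of inode->i_sb->s_dev */
	uint64_t ino;                /* inode->i_ino */
	uint64_t ino_generation;
	char     filename[];         /* NUL-padded to a u64 boundary */
};

int main(void)
{
	/* The header.size bump in perf_event_mmap_output() is exactly the
	 * size of the four new fields: 2 * u32 + 2 * u64 = 24 bytes. */
	printf("extra bytes for mmap2: %zu\n",
	       sizeof(uint32_t) * 2 + sizeof(uint64_t) * 2);
	return 0;
}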
+static int perf_event_mmap_match(struct perf_event *event, + void *data) +{ + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = vma->vm_flags & VM_EXEC; + + return (!executable && event->attr.mmap_data) || + (executable && (event->attr.mmap || event->attr.mmap2)); +} + static void perf_event_mmap_output(struct perf_event *event, void *data) { @@ -4690,6 +5052,17 @@ static void perf_event_mmap_output(struct perf_event *event, int size = mmap_event->event_id.header.size; int ret; + if (!perf_event_mmap_match(event, data)) + return; + + if (event->attr.mmap2) { + mmap_event->event_id.header.type = PERF_RECORD_MMAP2; + mmap_event->event_id.header.size += sizeof(mmap_event->maj); + mmap_event->event_id.header.size += sizeof(mmap_event->min); + mmap_event->event_id.header.size += sizeof(mmap_event->ino); + mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); + } + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, mmap_event->event_id.header.size); @@ -4700,6 +5073,14 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); + + if (event->attr.mmap2) { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -4710,21 +5091,12 @@ out: mmap_event->event_id.header.size = size; } -static int perf_event_mmap_match(struct perf_event *event, - void *data) -{ - struct perf_mmap_event *mmap_event = data; - struct vm_area_struct *vma = mmap_event->vma; - int executable = vma->vm_flags & VM_EXEC; - - return (!executable && event->attr.mmap_data) || - (executable && event->attr.mmap); -} - static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; + int maj = 0, min = 0; + u64 ino = 0, gen = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -4733,6 +5105,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) memset(tmp, 0, sizeof(tmp)); if (file) { + struct inode *inode; + dev_t dev; /* * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle @@ -4748,6 +5122,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; } + inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + gen = inode->i_generation; + maj = MAJOR(dev); + min = MINOR(dev); + } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), @@ -4778,14 +5159,17 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + mmap_event->maj = maj; + mmap_event->min = min; + mmap_event->ino = ino; + mmap_event->ino_generation = gen; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_match, - perf_event_mmap_output, + perf_event_aux(perf_event_mmap_output, mmap_event, NULL); @@ -4815,6 +5199,10 @@ void perf_event_mmap(struct vm_area_struct *vma) .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << 
PAGE_SHIFT, }, + /* .maj (attr_mmap2 only) */ + /* .min (attr_mmap2 only) */ + /* .ino (attr_mmap2 only) */ + /* .ino_generation (attr_mmap2 only) */ }; perf_event_mmap_event(&mmap_event); @@ -4892,6 +5280,7 @@ static int __perf_event_overflow(struct perf_event *event, __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); + tick_nohz_full_kick(); ret = 1; } } @@ -4961,7 +5350,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); * sign as trigger. */ -static u64 perf_swevent_set_period(struct perf_event *event) +u64 perf_swevent_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; @@ -5904,9 +6293,54 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static ssize_t +perf_event_mux_interval_ms_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); +} + +static ssize_t +perf_event_mux_interval_ms_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pmu *pmu = dev_get_drvdata(dev); + int timer, cpu, ret; + + ret = kstrtoint(buf, 0, &timer); + if (ret) + return ret; + + if (timer < 1) + return -EINVAL; + + /* same value, noting to do */ + if (timer == pmu->hrtimer_interval_ms) + return count; + + pmu->hrtimer_interval_ms = timer; + + /* update all cpuctx for this PMU */ + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + + if (hrtimer_active(&cpuctx->hrtimer)) + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + } + + return count; +} + static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, + __ATTR_RO(type), + __ATTR_RW(perf_event_mux_interval_ms), + __ATTR_NULL, }; static int pmu_bus_running; @@ -5952,7 +6386,7 @@ free_dev: static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, char *name, int type) +int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret; @@ -6001,7 +6435,9 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; + + __perf_cpu_hrtimer_init(cpuctx, cpu); + INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -6110,6 +6546,44 @@ unlock: return pmu; } +static void account_event_cpu(struct perf_event *event, int cpu) +{ + if (event->parent) + return; + + if (has_branch_stack(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); + } + if (is_cgroup_event(event)) + atomic_inc(&per_cpu(perf_cgroup_events, cpu)); +} + +static void account_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + static_key_slow_inc(&perf_sched_events.key); + if (event->attr.mmap || event->attr.mmap_data) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + if (event->attr.freq) { + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_full_kick_all(); + } + if (has_branch_stack(event)) + 
static_key_slow_inc(&perf_sched_events.key); + if (is_cgroup_event(event)) + static_key_slow_inc(&perf_sched_events.key); + + account_event_cpu(event, event->cpu); +} + /* * Allocate and initialize a event structure */ @@ -6124,7 +6598,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; - long err; + long err = -EINVAL; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) @@ -6207,49 +6681,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * we currently do not support PERF_FORMAT_GROUP on inherited events */ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; + goto err_ns; pmu = perf_init_event(event); - -done: - err = 0; if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) + goto err_ns; + else if (IS_ERR(pmu)) { err = PTR_ERR(pmu); - - if (err) { - if (event->ns) - put_pid_ns(event->ns); - kfree(event); - return ERR_PTR(err); + goto err_ns; } if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); - if (event->attr.mmap || event->attr.mmap_data) - atomic_inc(&nr_mmap_events); - if (event->attr.comm) - atomic_inc(&nr_comm_events); - if (event->attr.task) - atomic_inc(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); - if (err) { - free_event(event); - return ERR_PTR(err); - } - } - if (has_branch_stack(event)) { - static_key_slow_inc(&perf_sched_events.key); - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, - event->cpu)); + if (err) + goto err_pmu; } } return event; + +err_pmu: + if (event->destroy) + event->destroy(event); +err_ns: + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + + return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -6307,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + /* disabled for now */ + if (attr->mmap2) + return -EINVAL; + if (attr->__reserved_1) return -EINVAL; @@ -6327,11 +6791,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL; - /* kernel level capture: check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - /* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { @@ -6349,6 +6808,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, */ attr->branch_sample_type = mask; } + /* privileged levels capture (kernel, hv): check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -6412,6 +6875,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6419,16 +6884,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. 
+ */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } @@ -6518,17 +6995,14 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) - goto err_alloc; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - static_key_slow_inc(&perf_sched_events.key); + if (err) { + __free_event(event); + goto err_task; + } } + account_event(event); + /* * Special case software events and allow them to be part of * any hardware group. @@ -6724,6 +7198,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } + account_event(event); + ctx = find_get_context(event->pmu, task, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -6760,18 +7236,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event); + unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); - list_add(&event->event_entry, &events); + list_add(&event->migrate_entry, &events); } mutex_unlock(&src_ctx->mutex); synchronize_rcu(); mutex_lock(&dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &events, event_entry) { - list_del(&event->event_entry); + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); perf_install_in_context(dst_ctx, event, dst_cpu); get_ctx(dst_ctx); } @@ -7139,7 +7617,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. 
*/ - child_ctx = alloc_perf_context(event->pmu, child); + child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; @@ -7282,7 +7760,7 @@ static void __init perf_event_init_all_cpus(void) } } -static void __cpuinit perf_event_init_cpu(int cpu) +static void perf_event_init_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -7371,7 +7849,7 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int __cpuinit +static int perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; @@ -7387,7 +7865,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; - default: break; } @@ -7453,7 +7930,8 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; @@ -7470,11 +7948,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_css_free(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { - struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), - struct perf_cgroup, css); + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); + free_percpu(jc->info); kfree(jc); } @@ -7486,15 +7963,17 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +static void perf_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task) { /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1..1559fb0b929 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,23 +46,26 @@ #include <linux/smp.h> #include <linux/hw_breakpoint.h> - - /* * Constraints data */ +struct bp_cpuinfo { + /* Number of pinned cpu breakpoints in a cpu */ + unsigned int cpu_pinned; + /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ + unsigned int *tsk_pinned; + /* Number of non-pinned cpu/task breakpoints in a cpu */ + unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ +}; -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - +static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) +{ + return per_cpu_ptr(bp_cpuinfo + type, cpu); +} + /* Keep track of the breakpoints attached to tasks */ static LIST_HEAD(bp_task_head); @@ -96,8 +99,8 @@ static inline enum bp_type_idx 
find_slot_idx(struct perf_event *bp) */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type && - cpu == iter->cpu) + (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } return count; } +static const struct cpumask *cpumask_of_bp(struct perf_event *bp) +{ + if (bp->cpu >= 0) + return cpumask_of(bp->cpu); + return cpu_possible_mask; +} + /* * Report the number of pinned/un-pinned breakpoints we have in * a given cpu (cpu > -1) or in all of them (cpu = -1). @@ -135,25 +145,15 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(cpu, bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; - for_each_online_cpu(cpu) { - unsigned int nr; + for_each_cpu(cpu, cpumask) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + int nr; - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) + nr = info->cpu_pinned; + if (!bp->hw.bp_target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible[type], cpu); - + nr = info->flexible; if (nr > slots->flexible) slots->flexible = nr; } @@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(cpu, bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } + unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + int old_idx, new_idx; + + old_idx = task_bp_pinned(cpu, bp, type) - 1; + new_idx = old_idx + weight; + + if (old_idx >= 0) + tsk_pinned[old_idx]--; + if (new_idx >= 0) + tsk_pinned[new_idx]++; } /* @@ -214,33 +203,26 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; + const struct cpumask *cpumask = cpumask_of_bp(bp); + int cpu; - /* Pinned counter cpu profiling */ - if (!tsk) { + if (!enable) + weight = -weight; - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - 
per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + /* Pinned counter cpu profiling */ + if (!bp->hw.bp_target) { + get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - - if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } else { - for_each_online_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } + for_each_cpu(cpu, cpumask) + toggle_bp_task_slot(bp, cpu, type, weight); if (enable) list_add_tail(&bp->hw.bp_list, &bp_task_head); + else + list_del(&bp->hw.bp_list); } /* @@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. @@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to every cpus, check: * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) * * - If attached to a single cpu, check: * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu) + * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * -> Same checks as before. But now the info->flexible, if any, must keep * one register at least (or they will never be fed). 
* * - If attached to every cpus, check: * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) + * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM */ static int __reserve_bp_slot(struct perf_event *bp) { @@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; + struct perf_event * __percpu *cpu_events, *bp; + long err = 0; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); @@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered, context); - - *pevent = bp; - if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto fail; + break; } - } - put_online_cpus(); - return cpu_events; - -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); + per_cpu(*cpu_events, cpu) = bp; } put_online_cpus(); - free_percpu(cpu_events); + if (likely(!err)) + return cpu_events; + + unregister_wide_hw_breakpoint(cpu_events); return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; - struct perf_event **pevent; - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - unregister_hw_breakpoint(*pevent); - } + for_each_possible_cpu(cpu) + unregister_hw_breakpoint(per_cpu(*cpu_events, cpu)); + free_percpu(cpu_events); } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); @@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags) if (!(flags & PERF_EF_START)) bp->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(bp)) { + bp->hw.last_period = bp->hw.sample_period; + perf_swevent_set_period(bp); + } + return arch_install_hw_breakpoint(bp); } @@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - unsigned int **task_bp_pinned; int cpu, err_cpu; int i; @@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void) for_each_possible_cpu(cpu) { for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), + GFP_KERNEL); + if (!info->tsk_pinned) goto err_alloc; } } @@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); + kfree(get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59d..ca6599723be 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,10 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + atomic_t mmap_count; + unsigned long mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 
f3569747d62..ad8e1bdca70 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1682,12 +1682,10 @@ static bool handle_trampoline(struct pt_regs *regs) tmp = ri; ri = ri->next; kfree(tmp); + utask->depth--; if (!chained) break; - - utask->depth--; - BUG_ON(!ri); } diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd49..a949819055d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - - if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); - - if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); -} - /* * Let kernel threads use this to say that they allow a certain signal. * Must not be used if kthread was cloned with CLONE_SIGHAND. @@ -649,7 +638,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); - exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); if (group_dead) @@ -795,6 +783,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); exit_thread(); @@ -819,7 +808,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -835,7 +824,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a..832cb28105b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) { + if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01..086fe73ad6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; - struct mempolicy *pol; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); @@ -365,8 +364,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; - mm->free_area_cache = oldmm->mmap_base; - mm->cached_hole_size = ~0UL; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -402,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) + retval = vma_dup_policy(mpnt, tmp); + if (retval) goto fail_nomem_policy; - vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; @@ -474,7 +469,7 @@ out: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: - mpol_put(pol); + mpol_put(vma_policy(tmp)); fail_nomem_policy: 
kmem_cache_free(vm_area_cachep, tmp); fail_nomem: @@ -524,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); + mm->ioctx_table = NULL; #endif } @@ -540,8 +535,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->nr_ptes = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); @@ -1121,6 +1114,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[2]); } +static inline void +init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) +{ + task->pids[type].pid = pid; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1171,12 +1170,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); /* - * If the new process will be in a different pid namespace - * don't allow the creation of threads. + * If the new process will be in a different pid or user namespace + * do not allow it to share a thread group or signal handlers or + * parent with the forking task. */ - if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && - (task_active_pid_ns(current) != current->nsproxy->pid_ns)) - return ERR_PTR(-EINVAL); + if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } retval = security_task_create(clone_flags); if (retval) @@ -1199,8 +1202,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { - if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && - p->real_cred->user != INIT_USER) + if (p->real_cred->user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; @@ -1349,16 +1352,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(p->nsproxy->pid_ns); + pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (!pid) goto bad_fork_cleanup_io; } - p->pid = pid_nr(pid); - p->tgid = p->pid; - if (clone_flags & CLONE_THREAD) - p->tgid = current->tgid; - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1394,12 +1392,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. 
*/ - if (clone_flags & CLONE_THREAD) + p->pid = pid_nr(pid); + if (clone_flags & CLONE_THREAD) { p->exit_signal = -1; - else if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = (clone_flags & CSIGNAL); + p->group_leader = current->group_leader; + p->tgid = current->tgid; + } else { + if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->group_leader = p; + p->tgid = p->pid; + } p->pdeath_signal = 0; p->exit_state = 0; @@ -1408,15 +1413,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; - /* - * Ok, make it visible to the rest of the system. - * We dont wake it up yet. - */ - p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Need tasklist lock for parent etc handling! */ + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! + */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ @@ -1446,18 +1449,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_free_pid; } - if (clone_flags & CLONE_THREAD) { - current->signal->nr_threads++; - atomic_inc(&current->signal->live); - atomic_inc(&current->signal->sigcnt); - p->group_leader = current->group_leader; - list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - } - if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); + init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); + init_task_pid(p, PIDTYPE_SID, task_session(current)); + if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; @@ -1465,13 +1464,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); - attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); - attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_PGID); + attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); + } else { + current->signal->nr_threads++; + atomic_inc(&current->signal->live); + atomic_inc(&current->signal->sigcnt); + list_add_tail_rcu(&p->thread_group, + &p->group_leader->thread_group); } - attach_pid(p, PIDTYPE_PID, pid); + attach_pid(p, PIDTYPE_PID); nr_threads++; } @@ -1542,7 +1547,7 @@ static inline void init_idle_pids(struct pid_link *links) } } -struct task_struct * __cpuinit fork_idle(int cpu) +struct task_struct *fork_idle(int cpu) { struct task_struct *task; task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); @@ -1571,15 +1576,6 @@ long do_fork(unsigned long clone_flags, long nr; /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) - return -EINVAL; - } - - /* * Determine whether and which event to report to ptracer. 
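[Illustration, not part of the diff] The copy_process() rework above replaces the old do_fork() pre-check with a stricter rule: CLONE_SIGHAND or CLONE_PARENT can no longer be combined with CLONE_NEWUSER or CLONE_NEWPID. A minimal user-space sketch of a now-rejected combination; the EINVAL outcome assumes a kernel carrying this change, and since the check runs before any capability test no privileges are needed to observe it:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
        (void)arg;
        return 0;
}

int main(void)
{
        size_t sz = 64 * 1024;
        char *stack = malloc(sz);       /* child stack, grows down on most architectures */

        if (!stack)
                return 1;
        /* CLONE_PARENT together with a new PID namespace: rejected early in copy_process() */
        if (clone(child_fn, stack + sz, CLONE_PARENT | CLONE_NEWPID | SIGCHLD, NULL) == -1)
                printf("clone: %s (EINVAL expected)\n", strerror(errno));
        else
                printf("clone unexpectedly succeeded\n");
        return 0;       /* stack is reclaimed on process exit */
}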
When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event @@ -1675,6 +1671,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) +#elif defined(CONFIG_CLONE_BACKWARDS3) +SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, + int, stack_size, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, @@ -1814,11 +1816,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* - * If unsharing a pid namespace must also unshare the thread. - */ - if (unshare_flags & CLONE_NEWPID) - unshare_flags |= CLONE_THREAD; - /* * If unsharing a thread from a thread group, must also unshare vm. */ if (unshare_flags & CLONE_THREAD) diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efb..b462fa19751 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock); */ bool freezing_slow_path(struct task_struct *p) { - if (p->flags & PF_NOFREEZE) + if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) @@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; + /* + * This check can race with freezer_do_not_count, but worst case that + * will result in an extra wakeup being sent to the task. It does not + * race with freezer_count(), the barriers in freezer_count() and + * freezer_should_skip() ensure that either freezer_count() sees + * freezing == true in try_to_freeze() and freezes, or + * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task + * normally. + */ + if (freezer_should_skip(p)) + return false; + spin_lock_irqsave(&freezer_lock, flags); if (!freezing(p) || frozen(p)) { spin_unlock_irqrestore(&freezer_lock, flags); diff --git a/kernel/futex.c b/kernel/futex.c index b26dcfc02c9..c3a1a55a521 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -61,6 +61,8 @@ #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/sched/rt.h> +#include <linux/hugetlb.h> +#include <linux/freezer.h> #include <asm/futex.h> @@ -365,7 +367,7 @@ again: } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = page_head->mapping->host; - key->shared.pgoff = page_head->index; + key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); @@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. 
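[Illustration, not part of the diff] With alloc_pid() switched to nsproxy->pid_ns_for_children and the unshare() restriction above removed, unshare(CLONE_NEWPID) no longer forces CLONE_THREAD along: the caller keeps its PID and only later children are created in the new namespace. A rough user-space sketch (requires CAP_SYS_ADMIN; the commented values are the expected result, not captured output):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        if (unshare(CLONE_NEWPID) == -1) {
                perror("unshare(CLONE_NEWPID)");
                return 1;
        }
        /* The caller is not moved; only its future children enter the new namespace. */
        printf("parent: pid %d (unchanged)\n", getpid());

        pid_t child = fork();
        if (child == 0) {
                printf("child: pid %d\n", getpid());    /* expected: 1 */
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}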
*/ if (!timeout || timeout->task) - schedule(); + freezable_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c3..7a7d2ee96d4 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) { unsigned long val; - if (strict_strtoul(str, 0, &val)) { + if (kstrtoul(str, 0, &val)) { pr_warning("invalid gcov_persist parameter '%s'\n", str); return 0; } diff --git a/kernel/groups.c b/kernel/groups.c index 6b2588dd04f..90cf1c38c8e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!ns_capable(current_user_ns(), CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f..383319bae3f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -47,6 +47,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/timer.h> +#include <linux/freezer.h> #include <asm/uaccess.h> @@ -721,17 +722,20 @@ static int hrtimer_switch_to_hres(void) return 1; } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + /* - * Called from timekeeping code to reprogramm the hrtimer interrupt - * device. If called from the timer interrupt context we defer it to - * softirq context. + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus. */ void clock_was_set_delayed(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - cpu_base->clock_was_set = 1; - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + schedule_work(&hrtimer_work); } #else @@ -773,15 +777,19 @@ void clock_was_set(void) /* * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred. 
*/ void hrtimers_resume(void) { WARN_ONCE(!irqs_disabled(), KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + /* Retrigger on the local CPU */ retrigger_next_event(NULL); - timerfd_clock_was_set(); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); } static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) @@ -1432,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - - if (cpu_base->clock_was_set) { - cpu_base->clock_was_set = 0; - clock_was_set(); - } - hrtimer_peek_ahead_timers(); } @@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod t->task = NULL; if (likely(t->task)) - schedule(); + freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; @@ -1658,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void __cpuinit init_hrtimers_cpu(int cpu) +static void init_hrtimers_cpu(int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1739,7 +1740,7 @@ static void migrate_hrtimers(int scpu) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, +static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int scpu = (long)hcpu; @@ -1772,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata hrtimers_nb = { +static struct notifier_block hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9..3e97fb126e6 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> +#include <linux/utsname.h> /* * The number of tasks checked: @@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, timeout); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); + pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", + t->comm, t->pid, timeout); + pr_err(" %s %s %.*s\n", + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); sched_show_task(t); debug_show_held_locks(t); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1a758bc972..4a1fef09f65 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,15 +1,4 @@ -# Select this to activate the generic irq options below -config HAVE_GENERIC_HARDIRQS - bool - -if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" -# -# Interrupt subsystem related configuration options -# -config GENERIC_HARDIRQS - def_bool y - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available @@ -84,4 +73,3 @@ config SPARSE_IRQ If you don't know what to do here, say N. 
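[Illustration, not part of the diff] The futex and nanosleep hunks above swap bare schedule() for freezable_schedule(), and freeze_task() now skips tasks that are already inside a freezer_do_not_count() section. Driver code usually gets the same behaviour from the existing freezer helpers; a minimal sketch, with my_waitq and my_event_pending as placeholders:

#include <linux/freezer.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static bool my_event_pending;

static int my_wait_for_event(void)
{
        /*
         * Like wait_event_interruptible(), but the sleeper enters the
         * freezer instead of blocking suspend; freezable_schedule() is
         * the same idea for an open-coded schedule() call
         * (freezer_do_not_count(); schedule(); freezer_count();).
         */
        return wait_event_freezable(my_waitq, my_event_pending);
}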
endmenu -endif diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cbd97ce0b00..a3bb14fbe5c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +/** + * irq_disable - Mark interupt disabled + * @desc: irq descriptor which should be disabled + * + * If the chip does not implement the irq_disable callback, we + * use a lazy disable approach. That means we mark the interrupt + * disabled, but leave the hardware unmasked. That's an + * optimization because we avoid the hardware access for the + * common case where no interrupt happens after we marked it + * disabled. If an interrupt happens, then the interrupt flow + * handler masks the line at the hardware level and marks it + * pending. + */ void irq_disable(struct irq_desc *desc) { irq_state_set_disabled(desc); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c89295a8f66..452d6f2ba21 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -7,6 +7,7 @@ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/syscore_ops.h> @@ -16,11 +17,6 @@ static LIST_HEAD(gc_list); static DEFINE_RAW_SPINLOCK(gc_lock); -static inline struct irq_chip_regs *cur_regs(struct irq_data *d) -{ - return &container_of(d->chip, struct irq_chip_type, chip)->regs; -} - /** * irq_gc_noop - NOOP function * @d: irq_data @@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d) void irq_gc_mask_disable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); - gc->mask_cache &= ~mask; + irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } /** - * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * irq_gc_mask_set_bit - Mask chip via setting bit in mask register * @d: irq_data * * Chip has a single mask register. Values of this register are cached @@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d) void irq_gc_mask_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - gc->mask_cache |= mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + *ct->mask_cache |= mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); /** - * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register * @d: irq_data * * Chip has a single mask register. 
Values of this register are cached @@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d) void irq_gc_mask_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - gc->mask_cache &= ~mask; - irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + *ct->mask_cache &= ~mask; + irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); /** * irq_gc_unmask_enable_reg - Unmask chip via enable register @@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d) void irq_gc_unmask_enable_reg(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); - gc->mask_cache |= mask; + irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) void irq_gc_ack_set_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); /** * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit @@ -122,25 +126,27 @@ void irq_gc_ack_set_bit(struct irq_data *d) void irq_gc_ack_clr_bit(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = ~(1 << (d->irq - gc->irq_base)); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } /** - * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt + * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt * @d: irq_data */ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_reg_writel(mask, gc->reg_base + ct->regs.mask); + irq_reg_writel(mask, gc->reg_base + ct->regs.ack); irq_gc_unlock(gc); } @@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) void irq_gc_eoi(struct irq_data *d) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + struct irq_chip_type *ct = irq_data_get_chip_type(d); + u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); irq_gc_unlock(gc); } /** * irq_gc_set_wake - Set/clr wake bit for an interrupt - * @d: irq_data + * @d: irq_data + * @on: Indicates whether the wake bit should be set or cleared * * For chips where the wake from suspend functionality is not * configured in a separate register and the wakeup active 
state is @@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d) int irq_gc_set_wake(struct irq_data *d, unsigned int on) { struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); - u32 mask = 1 << (d->irq - gc->irq_base); + u32 mask = d->mask; if (!(mask & gc->wake_enabled)) return -EINVAL; @@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; +} + /** * irq_alloc_generic_chip - Allocate a generic chip and initialize it * @name: Name of the irq chip @@ -203,23 +224,183 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, gc = kzalloc(sz, GFP_KERNEL); if (gc) { - raw_spin_lock_init(&gc->lock); - gc->num_ct = num_ct; - gc->irq_base = irq_base; - gc->reg_base = reg_base; - gc->chip_types->chip.name = name; - gc->chip_types->handler = handler; + irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, + handler); } return gc; } EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); +static void +irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) +{ + struct irq_chip_type *ct = gc->chip_types; + u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask; + int i; + + for (i = 0; i < gc->num_ct; i++) { + if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) { + mskptr = &ct[i].mask_cache_priv; + mskreg = ct[i].regs.mask; + } + ct[i].mask_cache = mskptr; + if (flags & IRQ_GC_INIT_MASK_CACHE) + *mskptr = irq_reg_readl(gc->reg_base + mskreg); + } +} + +/** + * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * @d: irq domain for which to allocate chips + * @irqs_per_chip: Number of interrupts each chip handles + * @num_ct: Number of irq_chip_type instances associated with this + * @name: Name of the irq chip + * @handler: Default flow handler associated with these chips + * @clr: IRQ_* bits to clear in the mapping function + * @set: IRQ_* bits to set in the mapping function + * @gcflags: Generic chip specific setup flags + */ +int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) +{ + struct irq_domain_chip_generic *dgc; + struct irq_chip_generic *gc; + int numchips, sz, i; + unsigned long flags; + void *tmp; + + if (d->gc) + return -EBUSY; + + numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip); + if (!numchips) + return -EINVAL; + + /* Allocate a pointer, generic chip and chiptypes for each chip */ + sz = sizeof(*dgc) + numchips * sizeof(gc); + sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type)); + + tmp = dgc = kzalloc(sz, GFP_KERNEL); + if (!dgc) + return -ENOMEM; + dgc->irqs_per_chip = irqs_per_chip; + dgc->num_chips = numchips; + dgc->irq_flags_to_set = set; + dgc->irq_flags_to_clear = clr; + dgc->gc_flags = gcflags; + d->gc = dgc; + + /* Calc pointer to the first generic chip */ + tmp += sizeof(*dgc) + numchips * sizeof(gc); + for (i = 0; i < numchips; i++) { + /* Store the pointer to the generic chip */ + dgc->gc[i] = gc = tmp; + irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, + NULL, handler); + gc->domain = d; + raw_spin_lock_irqsave(&gc_lock, flags); + 
list_add_tail(&gc->list, &gc_list); + raw_spin_unlock_irqrestore(&gc_lock, flags); + /* Calc pointer to the next generic chip */ + tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + } + d->name = name; + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); + +/** + * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq + * @d: irq domain pointer + * @hw_irq: Hardware interrupt number + */ +struct irq_chip_generic * +irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return NULL; + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return NULL; + return dgc->gc[idx]; +} +EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); + /* * Separate lockdep class for interrupt chip which can nest irq_desc * lock. */ static struct lock_class_key irq_nested_lock_class; +/* + * irq_map_generic_chip - Map a generic chip for an irq domain + */ +static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain_chip_generic *dgc = d->gc; + struct irq_chip_generic *gc; + struct irq_chip_type *ct; + struct irq_chip *chip; + unsigned long flags; + int idx; + + if (!d->gc) + return -ENODEV; + + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return -EINVAL; + gc = dgc->gc[idx]; + + idx = hw_irq % dgc->irqs_per_chip; + + if (test_bit(idx, &gc->unused)) + return -ENOTSUPP; + + if (test_bit(idx, &gc->installed)) + return -EBUSY; + + ct = gc->chip_types; + chip = &ct->chip; + + /* We only init the cache for the first mapping of a generic chip */ + if (!gc->installed) { + raw_spin_lock_irqsave(&gc->lock, flags); + irq_gc_init_mask_cache(gc, dgc->gc_flags); + raw_spin_unlock_irqrestore(&gc->lock, flags); + } + + /* Mark the interrupt as installed */ + set_bit(idx, &gc->installed); + + if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(virq, &irq_nested_lock_class); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(data); + else + data->mask = 1 << idx; + + irq_set_chip_and_handler(virq, chip, ct->handler); + irq_set_chip_data(virq, gc); + irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); + return 0; +} + +struct irq_domain_ops irq_generic_chip_ops = { + .map = irq_map_generic_chip, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_generic_chip_ops); + /** * irq_setup_generic_chip - Setup a range of interrupts with a generic chip * @gc: Generic irq chip holding all data @@ -237,15 +418,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int set) { struct irq_chip_type *ct = gc->chip_types; + struct irq_chip *chip = &ct->chip; unsigned int i; raw_spin_lock(&gc_lock); list_add_tail(&gc->list, &gc_list); raw_spin_unlock(&gc_lock); - /* Init mask cache ? 
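[Illustration, not part of the diff] For context, a driver would consume the per-domain generic chip interface introduced above (irq_alloc_domain_generic_chips(), irq_get_domain_generic_chip(), irq_generic_chip_ops) roughly as follows; the register offsets, their polarity and the "my-intc" name are made up for illustration:

#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int my_intc_init(struct device_node *np, void __iomem *base)
{
        struct irq_domain *domain;
        struct irq_chip_generic *gc;
        int ret;

        domain = irq_domain_add_linear(np, 32, &irq_generic_chip_ops, NULL);
        if (!domain)
                return -ENOMEM;

        /* one generic chip with a single irq_chip_type covering all 32 lines */
        ret = irq_alloc_domain_generic_chips(domain, 32, 1, "my-intc",
                                             handle_level_irq, 0, 0,
                                             IRQ_GC_INIT_MASK_CACHE);
        if (ret)
                return ret;

        gc = irq_get_domain_generic_chip(domain, 0);
        gc->reg_base = base;
        /* hypothetical layout: a set bit in the mask register masks the line */
        gc->chip_types[0].regs.mask = 0x04;
        gc->chip_types[0].regs.ack  = 0x08;
        gc->chip_types[0].chip.irq_mask   = irq_gc_mask_set_bit;
        gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
        gc->chip_types[0].chip.irq_ack    = irq_gc_ack_set_bit;

        return 0;
}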
*/ - if (flags & IRQ_GC_INIT_MASK_CACHE) - gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + irq_gc_init_mask_cache(gc, flags); for (i = gc->irq_base; msk; msk >>= 1, i++) { if (!(msk & 0x01)) @@ -254,7 +434,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, if (flags & IRQ_GC_INIT_NESTED_LOCK) irq_set_lockdep_class(i, &irq_nested_lock_class); - irq_set_chip_and_handler(i, &ct->chip, ct->handler); + if (!(flags & IRQ_GC_NO_MASK)) { + struct irq_data *d = irq_get_irq_data(i); + + if (chip->irq_calc_mask) + chip->irq_calc_mask(d); + else + d->mask = 1 << (i - gc->irq_base); + } + irq_set_chip_and_handler(i, chip, ct->handler); irq_set_chip_data(i, gc); irq_modify_status(i, clr, set); } @@ -265,7 +453,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip); /** * irq_setup_alt_chip - Switch to alternative chip * @d: irq_data for this interrupt - * @type Flow type to be initialized + * @type: Flow type to be initialized * * Only to be called from chip->irq_set_type() callbacks. */ @@ -317,6 +505,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); +static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) +{ + unsigned int virq; + + if (!gc->domain) + return irq_get_irq_data(gc->irq_base); + + /* + * We don't know which of the irqs has been actually + * installed. Use the first one. + */ + if (!gc->installed) + return NULL; + + virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed)); + return virq ? irq_get_irq_data(virq) : NULL; +} + #ifdef CONFIG_PM static int irq_gc_suspend(void) { @@ -325,8 +531,12 @@ static int irq_gc_suspend(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_suspend) - ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_suspend) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_suspend(data); + } } return 0; } @@ -338,8 +548,12 @@ static void irq_gc_resume(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_resume) - ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_resume) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_resume(data); + } } } #else @@ -354,8 +568,12 @@ static void irq_gc_shutdown(void) list_for_each_entry(gc, &gc_list, list) { struct irq_chip_type *ct = gc->chip_types; - if (ct->chip.irq_pm_shutdown) - ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + if (ct->chip.irq_pm_shutdown) { + struct irq_data *data = irq_gc_get_irq_data(gc); + + if (data) + ct->chip.irq_pm_shutdown(data); + } } } diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 54a4d522323..706724e9835 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -16,12 +16,6 @@ #include <linux/smp.h> #include <linux/fs.h> -#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. - * ie. 
legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ - static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -29,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -39,41 +35,35 @@ static struct irq_domain *irq_default_domain; * register allocated irq_domain with irq_domain_register(). Returns pointer * to IRQ domain, or NULL on failure. */ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, - unsigned int revmap_type, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, + irq_hw_number_t hwirq_max, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; - domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, - of_node_to_nid(of_node)); + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), + GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; /* Fill structure */ - domain->revmap_type = revmap_type; + INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; domain->of_node = of_node_get(of_node); + domain->hwirq_max = hwirq_max; + domain->revmap_size = size; + domain->revmap_direct_max_irq = direct_max; - return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ - of_node_put(domain->of_node); - kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{ mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - pr_debug("Allocated domain of type %d @0x%p\n", - domain->revmap_type, domain); + + pr_debug("Added domain %s\n", domain->name); + return domain; } +EXPORT_SYMBOL_GPL(__irq_domain_add); /** * irq_domain_remove() - Remove an irq domain. @@ -87,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* - * Legacy domains don't manage their own irq_desc - * allocations, we expect the caller to handle irq_desc - * freeing on their own. - */ - break; - case IRQ_DOMAIN_MAP_TREE: - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_data.tree.height); - break; - case IRQ_DOMAIN_MAP_LINEAR: - kfree(domain->revmap_data.linear.revmap); - domain->revmap_data.linear.size = 0; - break; - case IRQ_DOMAIN_MAP_NOMAP: - break; - } + /* + * radix_tree_delete() takes care of destroying the root + * node when all entries are removed. Shout if there are + * any mappings left. 
+ */ + WARN_ON(domain->revmap_tree.height); list_del(&domain->link); @@ -121,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain) mutex_unlock(&irq_domain_mutex); - pr_debug("Removed domain of type %d @0x%p\n", - domain->revmap_type, domain); + pr_debug("Removed domain %s\n", domain->name); - irq_domain_free(domain); + of_node_put(domain->of_node); + kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; - int size = domain->revmap_data.legacy.size; - - if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) - return 0; - return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} - /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, - * pass zero to assign irqs on-the-fly. This will result in a - * linear IRQ domain so it is important to use irq_create_mapping() - * for each used IRQ, especially when SPARSE_IRQ is enabled. + * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + * pre-map all of the irqs in the domain to virqs starting at first_irq. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. * * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly. */ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, unsigned int size, @@ -166,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) { - int irq_base; + struct irq_domain *domain; + domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); + if (!domain) + return NULL; + + if (first_irq > 0) { if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* - * Set the descriptor allocator to search for a - * 1-to-1 mapping, such as irq_alloc_desc_at(). - * Use of_node_to_nid() which is defined to - * numa_node_id() on platforms that have no custom - * implementation. 
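[Illustration, not part of the diff] As the rewritten irq_domain_add_simple() kerneldoc above describes, a DT driver normally passes first_irq == 0 and lets every hwirq be mapped on demand. A minimal sketch with a custom .map callback; the names are placeholders and dummy_irq_chip is only used to keep the example self-contained:

#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int my_domain_map(struct irq_domain *d, unsigned int virq,
                         irq_hw_number_t hwirq)
{
        /* per-interrupt setup; a real driver installs its own irq_chip here */
        irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
        return 0;
}

static const struct irq_domain_ops my_domain_ops = {
        .map   = my_domain_map,
        .xlate = irq_domain_xlate_onecell,
};

static struct irq_domain *my_domain;

static int my_intc_probe(struct device_node *np)
{
        /* 64 hwirqs, first_irq == 0: pure on-demand (linear) mapping */
        my_domain = irq_domain_add_simple(np, 64, 0, &my_domain_ops, NULL);
        return my_domain ? 0 : -ENOMEM;
}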
- */ - irq_base = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); - if (irq_base < 0) { + /* attempt to allocated irq_descs */ + int rc = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); - irq_base = first_irq; - } - } else - irq_base = first_irq; - - return irq_domain_add_legacy(of_node, size, irq_base, 0, - ops, host_data); + } + irq_domain_associate_many(domain, first_irq, 0, size); } - /* A linear domain is the default */ - return irq_domain_add_linear(of_node, size, ops, host_data); + return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -219,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, void *host_data) { struct irq_domain *domain; - unsigned int i; - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + domain = __irq_domain_add(of_node, first_hwirq + size, + first_hwirq + size, 0, ops, host_data); if (!domain) return NULL; - domain->revmap_data.legacy.first_irq = first_irq; - domain->revmap_data.legacy.first_hwirq = first_hwirq; - domain->revmap_data.legacy.size = size; - - mutex_lock(&irq_domain_mutex); - /* Verify that all the irqs are available */ - for (i = 0; i < size; i++) { - int irq = first_irq + i; - struct irq_data *irq_data = irq_get_irq_data(irq); - - if (WARN_ON(!irq_data || irq_data->domain)) { - mutex_unlock(&irq_domain_mutex); - irq_domain_free(domain); - return NULL; - } - } - - /* Claim all of the irqs before registering a legacy domain */ - for (i = 0; i < size; i++) { - struct irq_data *irq_data = irq_get_irq_data(first_irq + i); - irq_data->hwirq = first_hwirq + i; - irq_data->domain = domain; - } - mutex_unlock(&irq_domain_mutex); - - for (i = 0; i < size; i++) { - int irq = first_irq + i; - int hwirq = first_hwirq + i; - - /* IRQ0 gets ignored */ - if (!irq) - continue; + irq_domain_associate_many(domain, first_irq, first_hwirq, size); - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - if (ops->map) - ops->map(domain, irq, hwirq); - - /* Clear norequest flags */ - irq_clear_status_flags(irq, IRQ_NOREQUEST); - } - - irq_domain_add(domain); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. 
- * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain; - unsigned int *revmap; - - revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, - of_node_to_nid(of_node)); - if (WARN_ON(!revmap)) - return NULL; - - domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); - if (!domain) { - kfree(revmap); - return NULL; - } - domain->revmap_data.linear.size = size; - domain->revmap_data.linear.revmap = revmap; - irq_domain_add(domain); - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_NOMAP, ops, host_data); - if (domain) { - domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). - */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain *domain = irq_domain_alloc(of_node, - IRQ_DOMAIN_MAP_TREE, ops, host_data); - if (domain) { - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - irq_domain_add(domain); - } - return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - -/** * irq_find_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller */ @@ -391,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate_many(struct irq_domain *domain, - unsigned int irq_base, int count) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { - /* - * disassociate in reverse order; - * not strictly necessary, but nice for unwinding - */ - while (count--) { - int irq = irq_base + count; - struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq; + struct irq_data *irq_data = irq_get_irq_data(irq); + irq_hw_number_t hwirq; - if (WARN_ON(!irq_data || irq_data->domain != domain)) - continue; + if (WARN(!irq_data || irq_data->domain != domain, + "virq%i doesn't exist; cannot disassociate\n", irq)) + return; - hwirq = irq_data->hwirq; - irq_set_status_flags(irq, IRQ_NOREQUEST); + hwirq = irq_data->hwirq; + irq_set_status_flags(irq, IRQ_NOREQUEST); - /* remove chip and handler */ - irq_set_chip_and_handler(irq, NULL, NULL); + /* remove chip and handler */ + irq_set_chip_and_handler(irq, NULL, NULL); - /* Make sure it's completed */ - synchronize_irq(irq); + /* Make sure it's completed */ + synchronize_irq(irq); - /* Tell the PIC about it */ - if (domain->ops->unmap) - domain->ops->unmap(domain, irq); - smp_mb(); + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, irq); + smp_mb(); - irq_data->domain = NULL; - irq_data->hwirq = 0; + irq_data->domain = NULL; + irq_data->hwirq = 0; - /* Clear reverse map */ - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: 
- if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - break; - } + /* Clear reverse map for this hwirq */ + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); } } -int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, - irq_hw_number_t hwirq_base, int count) +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - unsigned int virq = irq_base; - irq_hw_number_t hwirq = hwirq_base; - int i, ret; + struct irq_data *irq_data = irq_get_irq_data(virq); + int ret; - pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, - of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + if (WARN(hwirq >= domain->hwirq_max, + "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) + return -EINVAL; + if (WARN(!irq_data, "error: virq%i is not allocated", virq)) + return -EINVAL; + if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) + return -EINVAL; - for (i = 0; i < count; i++) { - struct irq_data *irq_data = irq_get_irq_data(virq + i); - - if (WARN(!irq_data, "error: irq_desc not allocated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - if (WARN(irq_data->domain, "error: irq_desc already associated; " - "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) - return -EINVAL; - }; - - for (i = 0; i < count; i++, virq++, hwirq++) { - struct irq_data *irq_data = irq_get_irq_data(virq); - - irq_data->hwirq = hwirq; - irq_data->domain = domain; - if (domain->ops->map) { - ret = domain->ops->map(domain, virq, hwirq); - if (ret != 0) { - /* - * If map() returns -EPERM, this interrupt is protected - * by the firmware or some other service and shall not - * be mapped. - * - * Since on some platforms we blindly try to map everything - * we end up with a log full of backtraces. - * - * So instead, we silently fail on -EPERM, it is the - * responsibility of the PIC driver to display a relevant - * message if needed. - */ - if (ret != -EPERM) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); - } - irq_data->domain = NULL; - irq_data->hwirq = 0; - goto err_unmap; + mutex_lock(&irq_domain_mutex); + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map) { + ret = domain->ops->map(domain, virq, hwirq); + if (ret != 0) { + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. Don't bother telling the user about it. 
+ */ + if (ret != -EPERM) { + pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", + domain->name, hwirq, virq, ret); } + irq_data->domain = NULL; + irq_data->hwirq = 0; + mutex_unlock(&irq_domain_mutex); + return ret; } - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = virq; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - break; - } + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && irq_data->chip) + domain->name = irq_data->chip->name; + } - irq_clear_status_flags(virq, IRQ_NOREQUEST); + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); } + mutex_unlock(&irq_domain_mutex); + + irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_associate); - err_unmap: - irq_domain_disassociate_many(domain, irq_base, i); - return -EINVAL; +void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + int i; + + pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, + of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + + for (i = 0; i < count; i++) { + irq_domain_associate(domain, irq_base + i, hwirq_base + i); + } } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -519,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly. 
*/ unsigned int irq_create_direct_mapping(struct irq_domain *domain) { @@ -528,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) - return 0; - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_data.nomap.max_irq) { + if (virq >= domain->revmap_direct_max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_data.nomap.max_irq); + domain->revmap_direct_max_irq); irq_free_desc(virq); return 0; } @@ -575,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (domain == NULL) domain = irq_default_domain; if (domain == NULL) { - pr_warning("irq_create_mapping called for" - " NULL domain, hwirq=%lx\n", hwirq); - WARN_ON(1); + WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } pr_debug("-> using domain @%p\n", domain); @@ -589,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } - /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return irq_domain_legacy_revmap(domain, hwirq); - /* Allocate a virtual interrupt number */ hint = hwirq % nr_irqs; if (hint == 0) @@ -645,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, if (unlikely(ret < 0)) return ret; - ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); - if (unlikely(ret < 0)) { - irq_free_descs(irq_base, count); - return ret; - } - + irq_domain_associate_many(domain, irq_base, hwirq_base, count); return 0; } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); @@ -665,20 +475,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { -#ifdef CONFIG_MIPS - /* - * Workaround to avoid breaking interrupt controller drivers - * that don't yet register an irq_domain. This is temporary - * code. ~~~gcl, Feb 24, 2012 - * - * Scheduled for removal in Linux v3.6. That should be enough - * time. 
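[Illustration, not part of the diff] The consumer side is unchanged by the rework: whether the mapping ends up linear, radix or direct, a caller still turns a hwirq into a virq and then requests it. Roughly (my_handler, the hwirq value 7 and the "my-dev" name are placeholders):

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/irqdomain.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int my_hook_up(struct irq_domain *domain, void *my_dev)
{
        unsigned int virq = irq_create_mapping(domain, 7 /* hwirq */);

        if (!virq)
                return -EINVAL;
        return request_irq(virq, my_handler, 0, "my-dev", my_dev);
}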
- */ - if (intsize > 0) - return intspec[0]; -#endif - pr_warning("no irq domain found for %s !\n", - of_node_full_name(controller)); + pr_warn("no irq domain found for %s !\n", + of_node_full_name(controller)); return 0; } @@ -698,7 +496,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && - type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + type != irq_get_trigger_type(virq)) irq_set_irq_type(virq, type); return virq; } @@ -720,11 +518,7 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - /* Never unmap legacy interrupts */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - - irq_domain_disassociate_many(domain, virq, 1); + irq_domain_disassociate(domain, virq); irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -745,63 +539,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - switch (domain->revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - return irq_domain_legacy_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_LINEAR: - return irq_linear_revmap(domain, hwirq); - case IRQ_DOMAIN_MAP_TREE: - rcu_read_lock(); - data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - if (data) - return data->irq; - break; - case IRQ_DOMAIN_MAP_NOMAP: + if (hwirq < domain->revmap_direct_max_irq) { data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) return hwirq; - break; } - return 0; -} -EXPORT_SYMBOL_GPL(irq_find_mapping); - -/** - * irq_linear_revmap() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path that can be called directly by irq controller code to - * save a handful of instructions. - */ -unsigned int irq_linear_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); - - /* Check revmap bounds; complain if exceeded */ - if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) - return 0; + /* Check if the hwirq is in the linear revmap. */ + if (hwirq < domain->revmap_size) + return domain->linear_revmap[hwirq]; - return domain->revmap_data.linear.revmap[hwirq]; + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_tree, hwirq); + rcu_read_unlock(); + return data ? data->irq : 0; } -EXPORT_SYMBOL_GPL(irq_linear_revmap); +EXPORT_SYMBOL_GPL(irq_find_mapping); #ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; - const char *p; - static const char none[] = "none"; - void *data; + struct irq_domain *domain; + struct radix_tree_iter iter; + void *data, **slot; int i; - seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", + "name", "mapped", "linear-max", "direct-max", "devtree-node"); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(domain, &irq_domain_list, link) { + int count = 0; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) + count++; + seq_printf(m, "%c%-16s %6u %10u %10u %s\n", + domain == irq_default_domain ? '*' : ' ', domain->name, + domain->revmap_size + count, domain->revmap_size, + domain->revmap_direct_max_irq, + domain->of_node ? 
of_node_full_name(domain->of_node) : ""); + } + mutex_unlock(&irq_domain_mutex); + + seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "domain name"); + "active", "type", "domain"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -809,28 +591,28 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); + domain = desc->irq_data.domain; - if (desc->action && desc->action->handler) { + if (domain) { struct irq_chip *chip; + int hwirq = desc->irq_data.hwirq; + bool direct; seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + seq_printf(m, "0x%05x ", hwirq); chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); data = irq_desc_get_chip_data(desc); seq_printf(m, data ? "0x%p " : " %p ", data); - if (desc->irq_data.domain) - p = of_node_full_name(desc->irq_data.domain->of_node); - else - p = none; - seq_printf(m, "%s\n", p); + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); + seq_printf(m, "%s\n", desc->irq_data.domain->name); } raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -927,18 +709,3 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#ifdef CONFIG_OF_IRQ -void irq_domain_generate_simple(const struct of_device_id *match, - u64 phys_base, unsigned int irq_start) -{ - struct device_node *node; - pr_debug("looking for phys_base=%llx, irq_start=%i\n", - (unsigned long long) phys_base, (int) irq_start); - node = of_find_matching_node_by_address(NULL, match, phys_base); - if (node) - irq_domain_add_legacy(node, 32, irq_start, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa17855ca65..514bcfd855a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return 0; if (irq_settings_can_request(desc)) { - if (desc->action) - if (irqflags & desc->action->flags & IRQF_SHARED) - canrequest =1; + if (!desc->action || + irqflags & desc->action->flags & IRQF_SHARED) + canrequest = 1; } irq_put_desc_unlock(desc, flags); return canrequest; @@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused) static int irq_thread(void *data) { struct callback_head on_exit_work; - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, @@ -854,8 +851,6 @@ static int irq_thread(void *data) else handler_fn = irq_thread_fn; - sched_setscheduler(current, SCHED_FIFO, &param); - init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, false); @@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (new->thread_fn && !nested) { struct task_struct *t; + static const struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; t = kthread_create(irq_thread, new, "irq/%d-%s", 
irq, new->name); @@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = PTR_ERR(t); goto out_mput; } + + sched_setscheduler(t, SCHED_FIFO, &param); + /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 19ed5c425c3..36f6ee181b0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v) } else { seq_printf(p, " %8s", "None"); } + if (desc->irq_data.domain) + seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0..297a9247a3b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -13,6 +13,7 @@ #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> +#include <linux/jump_label_ratelimit.h> #ifdef HAVE_JUMP_LABEL diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba74..2a74f307c5e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, if (first_colon && (!first_space || first_colon < first_space)) return parse_crashkernel_mem(ck_cmdline, system_ram, crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - return 0; + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); } /* diff --git a/kernel/kmod.c b/kernel/kmod.c index 8241906c4b6..b086006c59e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...) */ WARN_ON_ONCE(wait && current_is_async()); + if (!modprobe_path[0]) + return 0; + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -568,15 +571,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) DECLARE_COMPLETION_ONSTACK(done); int retval = 0; - helper_lock(); if (!sub_info->path) { - retval = -EINVAL; - goto out; + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; } - - if (sub_info->path[0] == '\0') - goto out; - + helper_lock(); if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; goto out; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcd..a0d367a4912 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ + struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; @@ -121,12 +122,6 @@ struct kprobe_insn_page { (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -138,8 +133,20 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { +static void *alloc_insn_page(void) +{ + return module_alloc(PAGE_SIZE); +} + +static void free_insn_page(void *page) +{ + module_free(NULL, page); +} + +struct kprobe_insn_cache kprobe_insn_slots = { + 
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; + kprobe_opcode_t *slot = NULL; + mutex_lock(&c->mutex); retry: list_for_each_entry(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { @@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * c->insn_size); + slot = kip->insns + (i * c->insn_size); + goto out; } } /* kip->nused is broken. Fix it. */ @@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) - return NULL; + goto out; /* * Use module_alloc so this page is within +/- 2GB of where the * kernel image and loaded module images reside. This is required * so x86_64 can correctly handle the %rip-relative fixups. */ - kip->insns = module_alloc(PAGE_SIZE); + kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); - return NULL; + goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + kip->cache = c; list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; + slot = kip->insns; +out: + mutex_unlock(&c->mutex); + return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ @@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) */ if (!list_is_singular(&kip->list)) { list_del(&kip->list); - module_free(NULL, kip->insns); + kip->cache->free(kip->insns); kfree(kip); } return 1; @@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + mutex_lock(&c->mutex); list_for_each_entry(kip, &c->pages, list) { long idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); @@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, collect_garbage_slots(c); } else collect_one_slot(kip, idx); - return; + goto out; } } /* Could not free this slot. 
*/ WARN_ON(1); +out: + mutex_unlock(&c->mutex); } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { +struct kprobe_insn_cache kprobe_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} #endif #endif @@ -467,6 +450,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +488,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -515,9 +499,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) /* Ditto to do_optimize_kprobes */ get_online_cpus(); mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -536,11 +520,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); free_aggr_kprobe(&op->kp); @@ -556,8 +540,6 @@ static __kprobes void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static __kprobes void kprobe_optimizer(struct work_struct *work) { - LIST_HEAD(free_list); - mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -566,7 +548,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. 
*/ - do_unoptimize_kprobes(&free_list); + do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); + do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); mutex_unlock(&kprobe_mutex); @@ -723,8 +705,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + + if (kprobe_unused(p)) { + /* Enqueue if it is unused */ + list_add(&op->list, &freeing_list); + /* + * Remove unused probes from the hash list. After waiting + * for synchronization, this probe is reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes().) + */ + hlist_del_rcu(&op->kp.hlist); + } + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } @@ -2322,6 +2315,7 @@ static ssize_t write_enabled_file_bool(struct file *file, if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; + buf[buf_size] = '\0'; switch (buf[0]) { case 'y': case 'Y': @@ -2333,6 +2327,8 @@ static ssize_t write_enabled_file_bool(struct file *file, case '0': disarm_all_kprobes(); break; + default: + return -EINVAL; } return count; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9..9659d38e008 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, unsigned long cnt; int ret; - if (strict_strtoul(buf, 0, &cnt)) + if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a..86ae2aebf00 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); diff --git 
a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd..e16c45b9ee7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(struct task_struct *curr) +static void print_held_locks_bug(void) { if (!debug_locks_off()) return; @@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); - printk("[ BUG: lock held at task exit time! ]\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); print_kernel_ident(); printk("-------------------------------------\n"); - printk("%s/%d is exiting with locks still held!\n", - curr->comm, task_pid_nr(curr)); - lockdep_print_held_locks(curr); - + lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(struct task_struct *task) +void debug_check_no_locks_held(void) { - if (unlikely(task->lockdep_depth > 0)) - print_held_locks_bug(task); + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); } +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c3..7cbd4507a7e 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -18,14 +18,14 @@ struct key *modsign_keyring; -extern __initdata const u8 modsign_certificate_list[]; -extern __initdata const u8 modsign_certificate_list_end[]; +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice * if modsign.pub changes. */ -static __initdata const char annoy_ccache[] = __TIME__ "foo"; +static __initconst const char annoy_ccache[] = __TIME__ "foo"; /* * Load the compiled-in keys diff --git a/kernel/module.c b/kernel/module.c index cab4bce49c2..dc582749fa1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; @@ -455,7 +456,7 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. 
*/ -static struct module *find_module_all(const char *name, +static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; @@ -463,7 +464,7 @@ static struct module *find_module_all(const char *name, list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; - if (strcmp(mod->name, name) == 0) + if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) return mod; } return NULL; @@ -471,7 +472,7 @@ static struct module *find_module_all(const char *name, struct module *find_module(const char *name) { - return find_module_all(name, false); + return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -482,23 +483,28 @@ static inline void __percpu *mod_percpu(struct module *mod) return mod->percpu; } -static int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { + Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; + unsigned long align = pcpusec->sh_addralign; + + if (!pcpusec->sh_size) + return 0; + if (align > PAGE_SIZE) { printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } - mod->percpu = __alloc_reserved_percpu(size, align); + mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { printk(KERN_WARNING "%s: Could not allocate %lu bytes percpu data\n", - mod->name, size); + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } - mod->percpu_size = size; + mod->percpu_size = pcpusec->sh_size; return 0; } @@ -563,10 +569,12 @@ static inline void __percpu *mod_percpu(struct module *mod) { return NULL; } -static inline int percpu_modalloc(struct module *mod, - unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info) { - return -ENOMEM; + /* UP modules shouldn't have this section: ENOMEM isn't quite right */ + if (info->sechdrs[info->index.pcpu].sh_size != 0) + return -ENOMEM; + return 0; } static inline void percpu_modfree(struct module *mod) { @@ -596,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ struct module_kobject *mk, char *buffer) \ { \ - return sprintf(buffer, "%s\n", mk->mod->field); \ + return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ } \ static int modinfo_##field##_exists(struct module *mod) \ { \ @@ -1604,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + static int mod_sysfs_init(struct module *mod) { int err; @@ -1631,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod) err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, "%s", mod->name); if (err) - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); /* delay uevent until full sysfs population */ out: @@ -1675,7 +1691,7 @@ out_unreg_param: out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); out: return err; } @@ -1684,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod) { remove_notes_attrs(mod); remove_sect_attrs(mod); - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); } #else /* !CONFIG_SYSFS */ 
@@ -2533,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, /* Sets info->hdr and info->len. */ static int copy_module_from_fd(int fd, struct load_info *info) { - struct file *file; + struct fd f = fdget(fd); int err; struct kstat stat; loff_t pos; ssize_t bytes = 0; - file = fget(fd); - if (!file) + if (!f.file) return -ENOEXEC; - err = security_kernel_module_from_file(file); + err = security_kernel_module_from_file(f.file); if (err) goto out; - err = vfs_getattr(&file->f_path, &stat); + err = vfs_getattr(&f.file->f_path, &stat); if (err) goto out; @@ -2570,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) pos = 0; while (pos < stat.size) { - bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, stat.size - pos); if (bytes < 0) { vfree(info->hdr); @@ -2584,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) info->len = pos; out: - fput(file); + fdput(f); return err; } @@ -2927,7 +2942,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; - Elf_Shdr *pcpusec; int err; mod = setup_load_info(info, flags); @@ -2942,17 +2956,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) err = module_frob_arch_sections(info->hdr, info->sechdrs, info->secstrings, mod); if (err < 0) - goto out; + return ERR_PTR(err); - pcpusec = &info->sechdrs[info->index.pcpu]; - if (pcpusec->sh_size) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, - pcpusec->sh_size, pcpusec->sh_addralign); - if (err) - goto out; - pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; - } + /* We will do a special allocation for per-cpu sections later. */ + info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2963,17 +2970,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Allocate and move to the final place */ err = move_module(mod, info); if (err) - goto free_percpu; + return ERR_PTR(err); /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); return mod; - -free_percpu: - percpu_modfree(mod); -out: - return ERR_PTR(err); } /* mod is no longer valid after this! */ @@ -3014,7 +3016,7 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module_all(name, true); + mod = find_module_all(name, strlen(name), true); ret = !mod || mod->state == MODULE_STATE_LIVE || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); @@ -3152,7 +3154,8 @@ static int add_unformed_module(struct module *mod) again: mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { + old = find_module_all(mod->name, strlen(mod->name), true); + if (old != NULL) { if (old->state == MODULE_STATE_COMING || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. 
*/ @@ -3198,6 +3201,17 @@ out: return err; } +static int unknown_module_param_cb(char *param, char *val, const char *modname) +{ + /* Check for magic 'dyndbg' arg */ + int ret = ddebug_dyndbg_module_param_cb(param, val, modname); + if (ret != 0) { + printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", + modname, param); + } + return 0; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, @@ -3237,6 +3251,11 @@ static int load_module(struct load_info *info, const char __user *uargs, } #endif + /* To avoid stressing percpu allocator, do this once we're unique. */ + err = percpu_modalloc(mod, info); + if (err) + goto unlink_mod; + /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) @@ -3284,7 +3303,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, &ddebug_dyndbg_module_param_cb); + -32768, 32767, unknown_module_param_cb); if (err < 0) goto bug_cleanup; @@ -3563,10 +3582,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) /* Don't lock: we're in enough trouble already. */ preempt_disable(); if ((colon = strchr(name, ':')) != NULL) { - *colon = '\0'; - if ((mod = find_module(name)) != NULL) + if ((mod = find_module_all(name, colon - name, false)) != NULL) ret = mod_find_symname(mod, colon+1); - *colon = ':'; } else { list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) diff --git a/kernel/mutex.c b/kernel/mutex.c index ad53a664f11..d24105b1b79 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -18,6 +18,7 @@ * Also see Documentation/mutex-design.txt. */ #include <linux/mutex.h> +#include <linux/ww_mutex.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/export.h> @@ -208,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ static inline int mutex_can_spin_on_owner(struct mutex *lock) { + struct task_struct *owner; int retval = 1; rcu_read_lock(); - if (lock->owner) - retval = lock->owner->on_cpu; + owner = ACCESS_ONCE(lock->owner); + if (owner) + retval = owner->on_cpu; rcu_read_unlock(); /* * if lock->owner is not set, the mutex owner may have just acquired @@ -254,16 +257,165 @@ void __sched mutex_unlock(struct mutex *lock) EXPORT_SYMBOL(mutex_unlock); +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of a unlocked mutex is not allowed. 
+ */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(&lock->base); +#endif + __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. 
+ */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + /* * Lock a mutex (possibly interruptible), slowpath: */ -static inline int __sched +static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + int ret; preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -298,6 +450,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; + if (use_ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + goto slowpath; + } + /* * If there's an owner, wait for it to either * release the lock or go to sleep. @@ -306,12 +474,19 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, owner = ACCESS_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) { mspin_unlock(MLOCK(lock), &node); - break; + goto slowpath; } if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + mutex_set_owner(lock); mspin_unlock(MLOCK(lock), &node); preempt_enable(); @@ -326,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - break; + goto slowpath; /* * The cpu_relax() call is a compiler barrier which forces @@ -340,6 +515,10 @@ slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); + /* once more, can we acquire the lock? */ + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + goto skip_wait; + debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); @@ -347,9 +526,6 @@ slowpath: list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) - goto done; - lock_contended(&lock->dep_map, ip); for (;;) { @@ -363,7 +539,7 @@ slowpath: * other waiters: */ if (MUTEX_SHOW_NO_WAITER(lock) && - (atomic_xchg(&lock->count, -1) == 1)) + (atomic_xchg(&lock->count, -1) == 1)) break; /* @@ -371,15 +547,16 @@ slowpath: * TASK_UNINTERRUPTIBLE case.) 
*/ if (unlikely(signal_pending_state(state, task))) { - mutex_remove_waiter(lock, &waiter, - task_thread_info(task)); - mutex_release(&lock->dep_map, 1, ip); - spin_unlock_mutex(&lock->wait_lock, flags); + ret = -EINTR; + goto err; + } - debug_mutex_free_waiter(&waiter); - preempt_enable(); - return -EINTR; + if (use_ww_ctx && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + goto err; } + __set_task_state(task, state); /* didn't get the lock, go to sleep: */ @@ -387,23 +564,49 @@ slowpath: schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } - -done: - lock_acquired(&lock->dep_map, ip); - /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, current_thread_info()); - mutex_set_owner(lock); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); + debug_mutex_free_waiter(&waiter); + +skip_wait: + /* got the lock - cleanup and rejoice! */ + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct mutex_waiter *cur; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->wait_list, list) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + } spin_unlock_mutex(&lock->wait_lock, flags); + preempt_enable(); + return 0; +err: + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); + mutex_release(&lock->dep_map, 1, ip); preempt_enable(); - - return 0; + return ret; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -411,7 +614,8 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -420,7 +624,8 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + 0, nest, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -429,7 +634,8 @@ int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -438,10 +644,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + 
ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx, 1); + if (!ret && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx, 1); + + if (!ret && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + #endif /* @@ -494,10 +758,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) * mutex_lock_interruptible() and mutex_trylock(). */ static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count); +__mutex_lock_killable_slowpath(struct mutex *lock); static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count); +__mutex_lock_interruptible_slowpath(struct mutex *lock); /** * mutex_lock_interruptible - acquire the mutex, interruptible @@ -515,12 +779,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_interruptible_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); @@ -530,12 +794,12 @@ int __sched mutex_lock_killable(struct mutex *lock) int ret; might_sleep(); - ret = __mutex_fastpath_lock_retval - (&lock->count, __mutex_lock_killable_slowpath); - if (!ret) + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { mutex_set_owner(lock); - - return ret; + return 0; + } else + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); @@ -544,24 +808,39 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL, 0); } static noinline int __sched -__mutex_lock_killable_slowpath(atomic_t *lock_count) +__mutex_lock_killable_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, + NULL, _RET_IP_, NULL, 0); +} - return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL, 0); } static noinline int __sched -__mutex_lock_interruptible_slowpath(atomic_t *lock_count) +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - struct mutex *lock = container_of(lock_count, struct mutex, count); + return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx, 1); +} - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, + struct 
ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx, 1); } + #endif /* @@ -617,6 +896,45 @@ int __sched mutex_trylock(struct mutex *lock) } EXPORT_SYMBOL(mutex_trylock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif + /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 * @cnt: the atomic which we are to dec diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0..8e7811086b8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -29,15 +29,15 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = { - .count = ATOMIC_INIT(1), - .uts_ns = &init_uts_ns, + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) - .ipc_ns = &init_ipc_ns, + .ipc_ns = &init_ipc_ns, #endif - .mnt_ns = NULL, - .pid_ns = &init_pid_ns, + .mnt_ns = NULL, + .pid_ns_for_children = &init_pid_ns, #ifdef CONFIG_NET - .net_ns = &init_net, + .net_ns = &init_net, #endif }; @@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); - if (IS_ERR(new_nsp->pid_ns)) { - err = PTR_ERR(new_nsp->pid_ns); + new_nsp->pid_ns_for_children = + copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); + if (IS_ERR(new_nsp->pid_ns_for_children)) { + err = PTR_ERR(new_nsp->pid_ns_for_children); goto out_pid; } @@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: - if (new_nsp->pid_ns) - put_pid_ns(new_nsp->pid_ns); + if (new_nsp->pid_ns_for_children) + put_pid_ns(new_nsp->pid_ns_for_children); out_pid: if (new_nsp->ipc_ns) put_ipc_ns(new_nsp->ipc_ns); @@ -125,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - int err = 0; - if (!old_ns) + if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWPID | CLONE_NEWNET)))) { + get_nsproxy(old_ns); return 0; - - get_nsproxy(old_ns); - - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET))) - return 0; - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; } + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + /* * CLONE_NEWIPC must detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -148,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) * means share undolist with parent, so we must forbid using * it along with CLONE_NEWIPC. 
*/ - if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { - err = -EINVAL; - goto out; - } + if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == + (CLONE_NEWIPC | CLONE_SYSVSEM)) + return -EINVAL; new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); - if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto out; - } + if (IS_ERR(new_ns)) + return PTR_ERR(new_ns); tsk->nsproxy = new_ns; - -out: - put_nsproxy(old_ns); - return err; + return 0; } void free_nsproxy(struct nsproxy *ns) @@ -174,8 +163,8 @@ void free_nsproxy(struct nsproxy *ns) put_uts_ns(ns->uts_ns); if (ns->ipc_ns) put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - put_pid_ns(ns->pid_ns); + if (ns->pid_ns_for_children) + put_pid_ns(ns->pid_ns_for_children); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } diff --git a/kernel/padata.c b/kernel/padata.c index 072f4ee4eb8..07af2c95dcf 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); @@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); @@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb, if (err) return notifier_from_errno(err); break; - - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); } return NOTIFY_OK; @@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; -#ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); -#endif - put_online_cpus(); BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); +#ifdef CONFIG_HOTPLUG_CPU + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + register_hotcpu_notifier(&pinst->cpu_notifier); +#endif + return pinst; err_free_masks: diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8..b6c482ccc5d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -15,6 +15,7 @@ #include <linux/notifier.h> #include <linux/module.h> #include <linux/random.h> +#include <linux/ftrace.h> #include <linux/reboot.h> #include <linux/delay.h> #include <linux/kexec.h> @@ -122,10 +123,14 @@ void panic(const char *fmt, ...) */ smp_send_stop(); - kmsg_dump(KMSG_DUMP_PANIC); - + /* + * Run any panic handlers, including those that might need to + * add information to the kmsg dump output. 
+ */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); if (!panic_blink) @@ -399,8 +404,11 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + disable_trace_on_warning(); + + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); diff --git a/kernel/params.c b/kernel/params.c index 53b958fcd63..c00d5b502aa 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -103,8 +103,8 @@ static int parse_one(char *param, || params[i].level > max_level) return 0; /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool - && params[i].ops->set != param_set_bint) + if (!val && + !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); @@ -241,7 +241,8 @@ int parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return sprintf(buffer, format, *((type *)kp->arg)); \ + return scnprintf(buffer, PAGE_SIZE, format, \ + *((type *)kp->arg)); \ } \ struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ @@ -252,13 +253,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { @@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; @@ -787,7 +790,7 @@ static void __init kernel_add_sysfs_param(const char *name, } /* - * param_sysfs_builtin - add contents in /sys/parameters for built-in 
modules + * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * @@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr, struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); - return sprintf(buf, "%s\n", vattr->version); + return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute *__start___modver[]; @@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = { struct kset *module_kset; int module_sysfs_initialized; +static void module_kobj_release(struct kobject *kobj) +{ + struct module_kobject *mk = to_module_kobject(kobj); + complete(mk->kobj_completion); +} + struct kobj_type module_ktype = { + .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; diff --git a/kernel/pid.c b/kernel/pid.c index 0db3e791a06..9b9a2669814 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, .last_pid = 0, + .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -264,6 +265,7 @@ void free_pid(struct pid *pid) struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); switch(--ns->nr_hashed) { + case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper @@ -271,6 +273,11 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; + case PIDNS_HASH_ADDING: + /* Handle a fork failure of the first process */ + WARN_ON(ns->child_reaper); + ns->nr_hashed = 0; + /* fall through */ case 0: schedule_work(&ns->proc_work); break; @@ -373,14 +380,10 @@ EXPORT_SYMBOL_GPL(find_vpid); /* * attach_pid() must be called with the tasklist_lock write-held. */ -void attach_pid(struct task_struct *task, enum pid_type type, - struct pid *pid) +void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link; - - link = &task->pids[type]; - link->pid = pid; - hlist_add_head_rcu(&link->node, &pid->tasks[type]); + struct pid_link *link = &task->pids[type]; + hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, @@ -412,7 +415,7 @@ void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); - attach_pid(task, type, pid); + attach_pid(task, type); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ @@ -594,7 +597,6 @@ void __init pidmap_init(void) /* Reserve PID 0. 
We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48..42086551a24 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) struct pid_namespace *ancestor, *new = ns; if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* @@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) if (ancestor != active) return -EINVAL; - put_pid_ns(nsproxy->pid_ns); - nsproxy->pid_ns = get_pid_ns(new); + put_pid_ns(nsproxy->pid_ns_for_children); + nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e..c7f31aa272f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. 
*/ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. 
*/ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= ptime; - } - } - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; - } else { - timer->expires.cpu -= utime; - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; - } else { - timer->expires.sched -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* @@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. 
*/ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. 
*/ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. 
*/ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. 
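
check_timers_list() above folds three nearly identical loops into one helper: pop expired entries (at most 20 per pass) off a list kept sorted by expiry, move them to a firing list, and report the next pending expiry, or 0 when nothing remains armed. A self-contained userspace sketch of that pattern, assuming a plain singly-linked list instead of the kernel's list_head (firing order is not preserved here):

#include <stdio.h>

struct cpu_timer {
	unsigned long long expires;
	struct cpu_timer *next;
};

static unsigned long long
check_timers(struct cpu_timer **timers, struct cpu_timer **firing,
	     unsigned long long curr)
{
	int maxfire = 20;

	while (*timers) {
		struct cpu_timer *t = *timers;

		if (!--maxfire || curr < t->expires)
			return t->expires;

		*timers = t->next;	/* unlink from the armed list */
		t->next = *firing;	/* push onto the firing list  */
		*firing = t;
	}
	return 0;
}

int main(void)
{
	struct cpu_timer c = { 300, NULL };
	struct cpu_timer b = { 200, &c };
	struct cpu_timer a = { 100, &b };
	struct cpu_timer *armed = &a, *firing = NULL;

	printf("next expiry: %llu\n", check_timers(&armed, &firing, 250));
	while (firing) {
		printf("fired: %llu\n", firing->expires);
		firing = firing->next;
	}
	return 0;
}
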
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. @@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. 
*/ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } @@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5dfdc9ea180..d444c4e834f 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,7 +100,6 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP - select HOTPLUG select HOTPLUG_CPU config PM_AUTOSLEEP @@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS bool depends on PM +config WQ_POWER_EFFICIENT_DEFAULT + bool "Enable workqueue power-efficient mode by default" + depends on PM + default n + help + Per-cpu workqueues are generally preferred because they show + better performance thanks to cache locality; unfortunately, + per-cpu workqueues tend to be more power hungry than unbound + workqueues. + + Enabling workqueue.power_efficient kernel parameter makes the + per-cpu workqueues which were observed to contribute + significantly to power consumption unbound, leading to measurably + lower power usage at the cost of small performance overhead. + + This config option determines whether workqueue.power_efficient + is enabled by default. + + If in doubt, say N. + config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9..9012ecf7b81 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work) mutex_lock(&autosleep_lock); - if (!pm_save_wakeup_count(initial_count)) { + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { mutex_unlock(&autosleep_lock); goto out; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773..0121dab83f4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -39,7 +39,7 @@ static int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata; +__visible int in_suspend __nosavedata; enum { HIBERNATION_INVALID, @@ -644,22 +644,23 @@ int hibernate(void) if (error) goto Exit; - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); printk("done.\n"); error = freeze_processes(); if (error) - goto Free_bitmaps; + goto Exit; + + lock_device_hotplug(); + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) - goto Thaw; + goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; @@ -682,14 +683,14 @@ int hibernate(void) pr_debug("PM: Image restored successfully.\n"); } + Free_bitmaps: + free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - - Free_bitmaps: - free_basic_memory_bitmaps(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -806,21 +807,20 @@ static int software_resume(void) pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto close_finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto close_finish; + goto Close_Finish; pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); - if (error) { - swsusp_close(FMODE_READ); - goto Done; - } + if (error) + goto Close_Finish; pr_debug("PM: Loading hibernation image.\n"); + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) @@ -828,9 +828,10 @@ static int software_resume(void) printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); - thaw_processes(); - Done: free_basic_memory_bitmaps(); + Thaw: + unlock_device_hotplug(); + thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); @@ -840,12 +841,12 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; -close_finish: + Close_Finish: swsusp_close(FMODE_READ); goto Finish; } -late_initcall(software_resume); +late_initcall_sync(software_resume); static const char * const hibernation_modes[] = { diff --git a/kernel/power/main.c b/kernel/power/main.c index d77663bfede..1d1bf630e6e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj, if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; + else + pm_print_active_wakeup_sources(); } out: @@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; + if (pm_trace_enabled) { + pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" + "PM: Correct system time has to be restored manually after resume.\n"); + } return n; } return -EINVAL; diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e..06ec8869dbf 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only) unsigned int todo; bool wq_busy = false; struct timeval start, end; - u64 elapsed_csecs64; - unsigned int elapsed_csecs; + u64 elapsed_msecs64; + unsigned int elapsed_msecs; bool wakeup = false; + int sleep_usecs = USEC_PER_MSEC; do_gettimeofday(&start); @@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only) /* * We need to retry, but first give the freezing tasks some - * time to enter the refrigerator. + * time to enter the refrigerator. 
Start with an initial + * 1 ms sleep followed by exponential backoff until 8 ms. */ - msleep(10); + usleep_range(sleep_usecs / 2, sleep_usecs); + if (sleep_usecs < 8 * USEC_PER_MSEC) + sleep_usecs *= 2; } do_gettimeofday(&end); - elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_csecs64, NSEC_PER_SEC / 100); - elapsed_csecs = elapsed_csecs64; + elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_msecs64, NSEC_PER_MSEC); + elapsed_msecs = elapsed_msecs64; if (todo) { printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " + printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", - elapsed_csecs / 100, elapsed_csecs % 100, + elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (!wakeup) { @@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, - elapsed_csecs % 100); + printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + elapsed_msecs % 1000); } return todo ? -EBUSY : 0; @@ -105,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. * * On success, returns 0. On failure, -errno and system is fully thawed. */ @@ -116,6 +122,9 @@ int freeze_processes(void) if (error) return error; + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + if (!pm_freezing) atomic_inc(&system_freezing_cnt); @@ -164,6 +173,7 @@ int freeze_kernel_threads(void) void thaw_processes(void) { struct task_struct *g, *p; + struct task_struct *curr = current; if (pm_freezing) atomic_dec(&system_freezing_cnt); @@ -178,10 +188,15 @@ void thaw_processes(void) read_lock(&tasklist_lock); do_each_thread(g, p) { + /* No other threads should have PF_SUSPEND_TASK set */ + WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); + WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); + curr->flags &= ~PF_SUSPEND_TASK; + usermodehelper_enable(); schedule(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 587dddeebf1..a394297f8b2 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -44,6 +44,7 @@ #include <linux/uaccess.h> #include <linux/export.h> +#include <trace/events/power.h> /* * locking rule: all changes to constraints or notifiers lists @@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); + trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, @@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); + trace_pm_qos_update_flags(action, prev_value, curr_value); return prev_value != curr_value; } @@ -293,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req) } EXPORT_SYMBOL_GPL(pm_qos_request_active); +static void __pm_qos_update_request(struct pm_qos_request *req, + s32 new_value) +{ + trace_pm_qos_update_request(req->pm_qos_class, new_value); + + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + 
&req->node, PM_QOS_UPDATE_REQ, new_value); +} + /** * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout * @work: work struct for the delayed work (timeout) @@ -305,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work) struct pm_qos_request, work); - pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); + __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); } /** @@ -333,6 +347,7 @@ void pm_qos_add_request(struct pm_qos_request *req, } req->pm_qos_class = pm_qos_class; INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); + trace_pm_qos_add_request(pm_qos_class, value); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, &req->node, PM_QOS_ADD_REQ, value); } @@ -360,11 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req, } cancel_delayed_work_sync(&req->work); - - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -387,6 +398,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request_timeout(req->pm_qos_class, + new_value, timeout_us); if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, @@ -416,6 +429,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); @@ -477,7 +491,7 @@ static int find_pm_qos_object_by_minor(int minor) { int pm_qos_class; - for (pm_qos_class = 0; + for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { if (minor == pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) @@ -491,7 +505,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) long pm_qos_class; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= 0) { + if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; @@ -584,7 +598,7 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); - for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i]); if (ret < 0) { printk(KERN_ERR "pm_qos_param: %s setup failed\n", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807..98c3b34a4cf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", - start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); + printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long 
long) end_pfn << PAGE_SHIFT) - 1); } /* @@ -742,7 +743,10 @@ int create_basic_memory_bitmaps(void) struct memory_bitmap *bm1, *bm2; int error = 0; - BUG_ON(forbidden_pages_map || free_pages_map); + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) @@ -883,7 +887,7 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; @@ -947,7 +951,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -1040,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1092,7 +1096,7 @@ void swsusp_free(void) unsigned long pfn, max_zone_pfn; for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1651,7 +1655,7 @@ unsigned long snapshot_get_image_size(void) static int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); - info->num_physpages = num_physpages; + info->num_physpages = get_num_physpages(); info->image_pages = nr_copy_pages; info->pages = snapshot_get_image_size(); info->size = info->pages; @@ -1754,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Clear page flags */ for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); @@ -1795,7 +1799,7 @@ static int check_header(struct swsusp_info *info) char *reason; reason = check_image_kernel(info); - if (!reason && info->num_physpages != num_physpages) + if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { printk(KERN_ERR "PM: Image mismatch: %s\n", reason); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb..62ee437b5c7 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } + ftrace_stop(); error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Enable_cpus: enable_nonboot_cpus(); + ftrace_start(); Platform_wake: if (need_suspend_ops(state) && suspend_ops->wake) @@ -265,11 +267,10 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); + pr_err("PM: Some devices 
failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); @@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - ftrace_start(); resume_console(); Close: if (need_suspend_ops(state) && suspend_ops->end) diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86..957f06164ad 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -39,6 +39,7 @@ static struct snapshot_data { char frozen; char ready; char platform_support; + bool free_bitmaps; } snapshot_state; atomic_t snapshot_device_available = ATOMIC_INIT(1); @@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) error = -ENOSYS; goto Unlock; } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; @@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } if (error) pm_notifier_call_chain(PM_POST_RESTORE); } - if (error) { - free_basic_memory_bitmaps(); + if (error) atomic_inc(&snapshot_device_available); - } + data->frozen = 0; data->ready = 0; data->platform_support = 0; @@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp) lock_system_sleep(); swsusp_free(); - free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? 
PM_POST_HIBERNATION : PM_POST_RESTORE); @@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!mutex_trylock(&pm_mutex)) return -EBUSY; + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, printk("done.\n"); error = freeze_processes(); - if (!error) + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else data->frozen = 1; + break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); data->frozen = 0; break; @@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } + unlock_device_hotplug(); mutex_unlock(&pm_mutex); return error; diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile new file mode 100644 index 00000000000..85405bdcf2b --- /dev/null +++ b/kernel/printk/Makefile @@ -0,0 +1,2 @@ +obj-y = printk.o +obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c new file mode 100644 index 00000000000..276762f3a46 --- /dev/null +++ b/kernel/printk/braille.c @@ -0,0 +1,49 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/console.h> +#include <linux/string.h> + +#include "console_cmdline.h" +#include "braille.h" + +char *_braille_console_setup(char **str, char **brl_options) +{ + if (!memcmp(*str, "brl,", 4)) { + *brl_options = ""; + *str += 4; + } else if (!memcmp(str, "brl=", 4)) { + *brl_options = *str + 4; + *str = strchr(*brl_options, ','); + if (!*str) + pr_err("need port name after brl=\n"); + else + *((*str)++) = 0; + } else + return NULL; + + return *str; +} + +int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ + int rtn = 0; + + if (c->brl_options) { + console->flags |= CON_BRL; + rtn = braille_register_console(console, c->index, c->options, + c->brl_options); + } + + return rtn; +} + +int +_braille_unregister_console(struct console *console) +{ + if (console->flags & CON_BRL) + return braille_unregister_console(console); + + return 0; +} diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h new file mode 100644 index 00000000000..769d771145c --- /dev/null +++ b/kernel/printk/braille.h @@ -0,0 +1,48 @@ +#ifndef _PRINTK_BRAILLE_H +#define _PRINTK_BRAILLE_H + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ + c->brl_options = brl_options; +} + +char * +_braille_console_setup(char **str, char **brl_options); + +int +_braille_register_console(struct console *console, struct console_cmdline *c); + +int +_braille_unregister_console(struct console *console); + +#else + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ +} + +static inline char * +_braille_console_setup(char **str, char **brl_options) +{ + return NULL; +} + +static inline int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ + return 0; +} + +static inline int +_braille_unregister_console(struct console *console) +{ + return 0; +} + +#endif + +#endif diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h new file mode 100644 index 00000000000..cbd69d84234 --- /dev/null +++ b/kernel/printk/console_cmdline.h @@ -0,0 +1,14 @@ +#ifndef _CONSOLE_CMDLINE_H +#define _CONSOLE_CMDLINE_H + 
+struct console_cmdline +{ + char name[8]; /* Name of the driver */ + int index; /* Minor dev. to use */ + char *options; /* Options for the driver */ +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + char *brl_options; /* Options for braille driver */ +#endif +}; + +#endif diff --git a/kernel/printk.c b/kernel/printk/printk.c index fa36e149442..b4e8500afdb 100644 --- a/kernel/printk.c +++ b/kernel/printk/printk.c @@ -51,6 +51,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/printk.h> +#include "console_cmdline.h" +#include "braille.h" + /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -105,19 +108,11 @@ static struct console *exclusive_console; /* * Array of consoles built from command line options (console=) */ -struct console_cmdline -{ - char name[8]; /* Name of the driver */ - int index; /* Minor dev. to use */ - char *options; /* Options for the driver */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - char *brl_options; /* Options for braille driver */ -#endif -}; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; + static int selected_console = -1; static int preferred_console = -1; int console_set_on_cmdline; @@ -178,7 +173,7 @@ static int console_may_schedule; * 67 "g" * 0032 00 00 00 padding to next message header * - * The 'struct log' buffer header must never be directly exported to + * The 'struct printk_log' buffer header must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -200,7 +195,7 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct log { +struct printk_log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ @@ -248,7 +243,7 @@ static u32 clear_idx; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN __alignof__(struct log) +#define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN; static volatile unsigned int logbuf_cpu = UINT_MAX; /* human readable text of the record */ -static char *log_text(const struct log *msg) +static char *log_text(const struct printk_log *msg) { - return (char *)msg + sizeof(struct log); + return (char *)msg + sizeof(struct printk_log); } /* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct log *msg) +static char *log_dict(const struct printk_log *msg) { - return (char *)msg + sizeof(struct log) + msg->text_len; + return (char *)msg + sizeof(struct printk_log) + msg->text_len; } /* get record by index; idx must point to valid msg */ -static struct log *log_from_idx(u32 idx) +static struct printk_log *log_from_idx(u32 idx) { - struct log *msg = (struct log *)(log_buf + idx); + struct printk_log *msg = (struct printk_log *)(log_buf + idx); /* * A length == 0 record is the end of buffer marker. Wrap around and * read the message at the start of the buffer. 
*/ if (!msg->len) - return (struct log *)log_buf; + return (struct printk_log *)log_buf; return msg; } /* get next record; idx must point to valid msg */ static u32 log_next(u32 idx) { - struct log *msg = (struct log *)(log_buf + idx); + struct printk_log *msg = (struct printk_log *)(log_buf + idx); /* length == 0 indicates the end of the buffer; wrap */ /* @@ -296,7 +291,7 @@ static u32 log_next(u32 idx) * return the one after that. */ if (!msg->len) { - msg = (struct log *)log_buf; + msg = (struct printk_log *)log_buf; return msg->len; } return idx + msg->len; @@ -308,11 +303,11 @@ static void log_store(int facility, int level, const char *dict, u16 dict_len, const char *text, u16 text_len) { - struct log *msg; + struct printk_log *msg; u32 size, pad_len; /* number of '\0' padding bytes to next message */ - size = sizeof(struct log) + text_len + dict_len; + size = sizeof(struct printk_log) + text_len + dict_len; pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; @@ -324,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct log)) + if (free > size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ @@ -332,18 +327,18 @@ static void log_store(int facility, int level, log_first_seq++; } - if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 * to signify a wrap around. */ - memset(log_buf + log_next_idx, 0, sizeof(struct log)); + memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); log_next_idx = 0; } /* fill message */ - msg = (struct log *)(log_buf + log_next_idx); + msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); @@ -356,13 +351,60 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct log) + text_len + dict_len + pad_len; + msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; /* insert message */ log_next_idx += msg->len; log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. 
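
log_store() and log_from_idx()/log_next() above keep variable-length records in one flat buffer and use a header whose len is 0 as the "wrap to the start" marker. A userspace sketch of just that mechanic, with a simplified header standing in for struct printk_log:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct rec {			/* stand-in for struct printk_log */
	uint16_t len;		/* whole record; 0 == wrap marker */
	uint16_t text_len;	/* payload bytes                  */
};

#define LOG_ALIGN 4
#define ALIGN_UP(x) (((x) + LOG_ALIGN - 1) & ~(LOG_ALIGN - 1))

static unsigned char buf[128];

/* Append one record at idx, return the index just past it. */
static uint32_t store(uint32_t idx, const char *text)
{
	struct rec r;

	r.text_len = (uint16_t)strlen(text);
	r.len = (uint16_t)ALIGN_UP(sizeof(r) + r.text_len);
	memcpy(buf + idx, &r, sizeof(r));
	memcpy(buf + idx + sizeof(r), text, r.text_len);
	return idx + r.len;
}

int main(void)
{
	uint32_t first = 96, next_free, idx;

	/* One record near the end of the buffer ... */
	next_free = store(first, "tail record");
	/* ... pretend the next one does not fit, so drop a wrap marker ... */
	memset(buf + next_free, 0, sizeof(struct rec));
	/* ... and continue at the start of the buffer. */
	store(0, "wrapped record");

	/* Reader: a zero-length header sends us back to index 0. */
	for (idx = first; ; ) {
		struct rec r;

		memcpy(&r, buf + idx, sizeof(r));
		if (r.len == 0) {		/* wrap marker */
			idx = 0;
			continue;
		}
		printf("%.*s\n", r.text_len, (char *)buf + idx + sizeof(r));
		if (idx == 0)			/* stop after the wrapped record */
			break;
		idx += r.len;
	}
	return 0;
}
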
+ */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -432,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct log *msg; + struct printk_log *msg; u64 ts_usec; size_t i; char cont = '-'; @@ -620,7 +662,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -676,14 +719,14 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); /* - * Export struct log size and field offsets. User space tools can + * Export struct printk_log size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(log); - VMCOREINFO_OFFSET(log, ts_nsec); - VMCOREINFO_OFFSET(log, len); - VMCOREINFO_OFFSET(log, text_len); - VMCOREINFO_OFFSET(log, dict_len); + VMCOREINFO_STRUCT_SIZE(printk_log); + VMCOREINFO_OFFSET(printk_log, ts_nsec); + VMCOREINFO_OFFSET(printk_log, len); + VMCOREINFO_OFFSET(printk_log, text_len); + VMCOREINFO_OFFSET(printk_log, dict_len); } #endif @@ -813,45 +856,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. 
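
The do_syslog()/check_syslog_permissions() changes above are what a syslog(2) caller runs into; from userspace the same interface is reached through glibc's klogctl(). A sketch that reads the whole log buffer, using the numeric action codes documented in the syslog(2) man page (10 = size of buffer, 3 = read all):

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	int len = klogctl(10, NULL, 0);		/* SYSLOG_ACTION_SIZE_BUFFER */
	char *buf;

	if (len <= 0) {
		fprintf(stderr, "klogctl(SIZE_BUFFER) failed\n");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	len = klogctl(3, buf, len);		/* SYSLOG_ACTION_READ_ALL */
	if (len < 0) {
		perror("klogctl(READ_ALL)");	/* EPERM when restricted */
		free(buf);
		return 1;
	}
	fwrite(buf, 1, len, stdout);
	free(buf);
	return 0;
}
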
- */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -875,7 +879,7 @@ static size_t print_time(u64 ts, char *buf) (unsigned long)ts, rem_nsec / 1000); } -static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) { size_t len = 0; unsigned int prefix = (msg->facility << 3) | msg->level; @@ -898,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); @@ -960,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, static int syslog_print(char __user *buf, int size) { char *text; - struct log *msg; + struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1051,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = clear_idx; prev = 0; while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); len += msg_print_text(msg, prev, true, NULL, 0); prev = msg->flags; @@ -1064,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = clear_idx; prev = 0; while (len > size && seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); len -= msg_print_text(msg, prev, true, NULL, 0); prev = msg->flags; @@ -1078,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len = 0; prev = 0; while (len >= 0 && seq < next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); int textlen; textlen = msg_print_text(msg, prev, true, text, @@ -1224,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = 0; while (seq < log_next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -1249,7 +1253,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* @@ -1360,9 +1364,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); if (wake) up(&console_sem); - raw_spin_unlock(&logbuf_lock); return retval; } @@ -1710,10 +1714,10 @@ static struct cont { u8 level; bool flushed:1; } cont; -static struct log *log_from_idx(u32 idx) { return NULL; } +static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, 
enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -1752,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - if (!brl_options) - selected_console = i; - return 0; + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (strcmp(c->name, name) == 0 && c->index == idx) { + if (!brl_options) + selected_console = i; + return 0; } + } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) selected_console = i; - c = &console_cmdline[i]; strlcpy(c->name, name, sizeof(c->name)); c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - c->brl_options = brl_options; -#endif + braille_set_options(c, brl_options); + c->index = idx; return 0; } @@ -1781,20 +1785,8 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (!memcmp(str, "brl,", 4)) { - brl_options = ""; - str += 4; - } else if (!memcmp(str, "brl=", 4)) { - brl_options = str + 4; - str = strchr(brl_options, ','); - if (!str) { - printk(KERN_ERR "need port name after brl=\n"); - return 1; - } - *(str++) = 0; - } -#endif + if (_braille_console_setup(&str, &brl_options)) + return 1; /* * Decode str into name, index, options. @@ -1849,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha struct console_cmdline *c; int i; - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - c = &console_cmdline[i]; - strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx_new; - return i; + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) + if (strcmp(c->name, name) == 0 && c->index == idx) { + strlcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; } /* not found */ return -1; @@ -1912,7 +1904,7 @@ void resume_console(void) * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. */ -static int __cpuinit console_cpu_notify(struct notifier_block *self, +static int console_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { @@ -2037,7 +2029,7 @@ void console_unlock(void) console_cont_flush(text, sizeof(text)); again: for (;;) { - struct log *msg; + struct printk_log *msg; size_t len; int level; @@ -2232,6 +2224,14 @@ void register_console(struct console *newcon) int i; unsigned long flags; struct console *bcon = NULL; + struct console_cmdline *c; + + if (console_drivers) + for_each_console(bcon) + if (WARN(bcon == newcon, + "console '%s%d' already registered\n", + bcon->name, bcon->index)) + return; /* * before we register a new CON_BOOT console, make sure we don't @@ -2279,30 +2279,25 @@ void register_console(struct console *newcon) * See if this console matches one we selected on * the command line. 
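
For reference, the "decode str into name, index, options" step in console_setup() above turns a console= argument such as ttyS0,115200n8 into the fields kept in struct console_cmdline. A standalone sketch of that split (simplified: no braille prefix and no bare-number shorthand):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char str[] = "ttyS0,115200n8";
	char *s, *options;
	int idx;

	options = strchr(str, ',');
	if (options)
		*options++ = '\0';

	/* the name runs up to the first digit, the digits are the index */
	for (s = str; *s; s++)
		if (*s >= '0' && *s <= '9')
			break;
	idx = atoi(s);
	*s = '\0';

	printf("name=\"%s\" index=%d options=\"%s\"\n",
	       str, idx, options ? options : "");
	return 0;
}
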
*/ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; - i++) { - if (strcmp(console_cmdline[i].name, newcon->name) != 0) + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && - newcon->index != console_cmdline[i].index) + newcon->index != c->index) continue; if (newcon->index < 0) - newcon->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console_cmdline[i].brl_options) { - newcon->flags |= CON_BRL; - braille_register_console(newcon, - console_cmdline[i].index, - console_cmdline[i].options, - console_cmdline[i].brl_options); + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) return; - } -#endif + if (newcon->setup && newcon->setup(newcon, console_cmdline[i].options) != 0) break; newcon->flags |= CON_ENABLED; - newcon->index = console_cmdline[i].index; + newcon->index = c->index; if (i == selected_console) { newcon->flags |= CON_CONSDEV; preferred_console = selected_console; @@ -2385,13 +2380,13 @@ EXPORT_SYMBOL(register_console); int unregister_console(struct console *console) { struct console *a, *b; - int res = 1; + int res; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE - if (console->flags & CON_BRL) - return braille_unregister_console(console); -#endif + res = _braille_unregister_console(console); + if (res) + return res; + res = 1; console_lock(); if (console_drivers == console) { console_drivers=console->next; @@ -2657,7 +2652,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct log *msg; + struct printk_log *msg; size_t l = 0; bool ret = false; @@ -2769,7 +2764,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, idx = dumper->cur_idx; prev = 0; while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -2782,7 +2777,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, idx = dumper->cur_idx; prev = 0; while (l > size && seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); @@ -2797,7 +2792,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, l = 0; prev = 0; while (seq < dumper->next_seq) { - struct log *msg = log_from_idx(idx); + struct printk_log *msg = log_from_idx(idx); l += msg_print_text(msg, prev, syslog, buf + l, size - l); idx = log_next(idx); diff --git a/kernel/profile.c b/kernel/profile.c index 0bf40073766..6631e1ef55a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -331,7 +331,7 @@ out: put_cpu(); } -static int __cpuinit profile_cpu_callback(struct notifier_block *info, +static int profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69..dd562e9aa2c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) */ int dumpable = 0; /* Don't let security modules deny introspection */ - if (task == current) + if (same_thread_group(task, current)) return 0; rcu_read_lock(); tcred = __task_cred(task); @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct 
task_struct *child, if (unlikely(is_compat_task())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - ret = copy_siginfo_to_user32(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); + if (copy_siginfo_to_user32(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } + } else #endif { siginfo_t __user *uinfo = (siginfo_t __user *) data; - ret = copy_siginfo_to_user(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); - } - - if (ret) { - ret = -EFAULT; - break; + if (copy_siginfo_to_user(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } } data += sizeof(siginfo_t); @@ -842,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_GETSIGMASK: + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + ret = -EFAULT; + else + ret = 0; + + break; + + case PTRACE_SETSIGMASK: { + sigset_t new_set; + + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { + ret = -EFAULT; + break; + } + + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + /* + * Every thread does recalc_sigpending() after resume, so + * retarget_shared_pending() and recalc_sigpending() are not + * called here. + */ + spin_lock_irq(&child->sighand->siglock); + child->blocked = new_set; + spin_unlock_irq(&child->sighand->siglock); + + ret = 0; + break; + } + case PTRACE_INTERRUPT: /* * Stop tracee without any side-effect on signal or job @@ -946,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request, #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: - case PTRACE_SETREGSET: - { + case PTRACE_SETREGSET: { struct iovec kiov; struct iovec __user *uiov = datavp; @@ -1179,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/kernel/range.c b/kernel/range.c index eb911dbce26..322ea8e93e4 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sort.h> - +#include <linux/string.h> #include <linux/range.h> int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (start >= end) return nr_range; - /* Try to merge it with old one: */ + /* get new start/end: */ for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; u64 common_start, common_end; if (!range[i].end) @@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (common_start > common_end) continue; - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); + /* new start/end, will add it back at last */ + start = min(range[i].start, start); + end = max(range[i].end, end); - /* clear it and add it back for further merge */ - range[i].start = 0; - range[i].end = 0; - return add_range_with_merge(range, az, nr_range, - final_start, 
final_end); + memmove(&range[i], &range[i + 1], + (nr_range - (i + 1)) * sizeof(range[i])); + range[nr_range - 1].start = 0; + range[nr_range - 1].end = 0; + nr_range--; + i--; } /* Need to add it: */ diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e..77131966c4a 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -67,12 +67,15 @@ extern struct debug_obj_descr rcuhead_debug_descr; -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { - debug_object_activate(head, &rcuhead_debug_descr); + int r1; + + r1 = debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, STATE_RCU_HEAD_QUEUED); + return r1; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) @@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) debug_object_deactivate(head, &rcuhead_debug_descr); } #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { + return 0; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) @@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48ab70384a4..b02a339836b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -104,31 +104,7 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (likely(list_empty(¤t->rcu_node_entry))) - return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; - __rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -145,11 +121,8 @@ static struct lock_class_key rcu_sched_lock_key; struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -int debug_lockdep_rcu_enabled(void) +int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active && debug_locks && current->lockdep_recursion == 0; @@ -239,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) } /* - * fixup_init is called when: - * - an active object is initialized - */ -static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. 
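/*
 * Illustrative sketch (not part of the patch above): a userspace
 * re-implementation of the merge-and-compact approach that the
 * add_range_with_merge() hunk switches to -- overlapping entries are
 * folded into the incoming start/end, the array is compacted with
 * memmove(), and the widened range is appended once at the end instead
 * of recursing.  The types, MIN/MAX helpers and main() are local to this
 * example, not kernel code.
 */
#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; };

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int add_range(struct range *range, int az, int nr_range,
		     unsigned long long start, unsigned long long end)
{
	if (start >= end || nr_range >= az)
		return nr_range;
	range[nr_range].start = start;
	range[nr_range].end = end;
	return nr_range + 1;
}

static int add_range_with_merge(struct range *range, int az, int nr_range,
				unsigned long long start, unsigned long long end)
{
	int i;

	if (start >= end)
		return nr_range;

	/* Fold every overlapping entry into [start, end) and compact. */
	for (i = 0; i < nr_range; i++) {
		unsigned long long common_start, common_end;

		if (!range[i].end)
			continue;

		common_start = MAX(range[i].start, start);
		common_end = MIN(range[i].end, end);
		if (common_start > common_end)
			continue;

		/* Widen the new range, then drop the merged slot. */
		start = MIN(range[i].start, start);
		end = MAX(range[i].end, end);

		memmove(&range[i], &range[i + 1],
			(nr_range - (i + 1)) * sizeof(range[i]));
		range[nr_range - 1].start = 0;
		range[nr_range - 1].end = 0;
		nr_range--;
		i--;
	}

	/* Add the (possibly widened) range back once. */
	return add_range(range, az, nr_range, start, end);
}

int main(void)
{
	struct range r[8] = { { 0, 10 }, { 20, 30 } };
	int i, n;

	n = add_range_with_merge(r, 8, 2, 5, 25);
	for (i = 0; i < n; i++)
		printf("[%llu, %llu)\n", r[i].start, r[i].end);
	return 0;	/* prints the single merged range [0, 30) */
}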
- */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_init(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - -/* * fixup_activate is called when: * - an active object is activated * - an unknown object is activated (might be a statically initialized object) @@ -295,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); return 0; - - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_activate(head, &rcuhead_debug_descr); - return 1; default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_free(head, &rcuhead_debug_descr); return 1; - default: - return 0; } } @@ -396,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_init = rcuhead_fixup_init, .fixup_activate = rcuhead_fixup_activate, - .fixup_free = rcuhead_fixup_free, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, +void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) { diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index a0714a51b6d..9ed6075dc56 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,7 +44,6 @@ /* Forward declarations for rcutiny_plugin.h. 
*/ struct rcu_ctrlblk; -static void invoke_rcu_callbacks(void); static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, @@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - reset_cpu_stall_ticks(rcp); + RCU_TRACE(reset_cpu_stall_ticks(rcp)); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_callbacks(); + raise_softirq(RCU_SOFTIRQ); local_irq_restore(flags); } @@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { - check_cpu_stalls(); + RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); - rcu_preempt_check_callbacks(); } /* @@ -266,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - char *rn = NULL; + const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); @@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); return; } @@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; - rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread())); + false)); } static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); } /* @@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 8a233002fae..280d06cae35 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -36,7 +36,7 @@ struct rcu_ctrlblk { RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(char *name); /* Name of RCU type. */ + RCU_TRACE(const char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. 
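/*
 * Illustrative sketch (not part of the patch above): the rcutiny hunks
 * rely on a single callback list threaded through ->rcucblist with two
 * tail pointers, ->donetail and ->curtail.  A quiescent state advances
 * ->donetail to ->curtail, and callback processing splices off everything
 * up to ->donetail before invoking it.  The userspace model below mirrors
 * that list surgery; all names and main() are local to the example.
 */
#include <stdio.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *);
};

struct ctrlblk {
	struct cb *list;       /* head of the callback list */
	struct cb **donetail;  /* ->next of last callback whose GP has ended */
	struct cb **curtail;   /* ->next of last queued callback */
};

static void enqueue(struct ctrlblk *cp, struct cb *head,
		    void (*func)(struct cb *))
{
	head->func = func;
	head->next = NULL;
	*cp->curtail = head;
	cp->curtail = &head->next;
}

/* A quiescent state promotes everything queued so far to "done". */
static void quiescent_state(struct ctrlblk *cp)
{
	if (cp->list != NULL && cp->donetail != cp->curtail)
		cp->donetail = cp->curtail;
}

/*
 * Splice off the done sublist and invoke it, mirroring the list surgery
 * in __rcu_process_callbacks() shown above.
 */
static void process_callbacks(struct ctrlblk *cp)
{
	struct cb *list, *next;

	if (cp->donetail == &cp->list)
		return;		/* nothing ready to invoke */

	list = cp->list;
	cp->list = *cp->donetail;
	*cp->donetail = NULL;
	if (cp->curtail == cp->donetail)
		cp->curtail = &cp->list;
	cp->donetail = &cp->list;

	while (list != NULL) {
		next = list->next;
		list->func(list);
		list = next;
	}
}

static void say(struct cb *unused)
{
	(void)unused;
	printf("callback invoked\n");
}

int main(void)
{
	struct ctrlblk blk;
	struct cb a, b;

	blk.list = NULL;
	blk.donetail = &blk.list;
	blk.curtail = &blk.list;

	enqueue(&blk, &a, say);
	quiescent_state(&blk);		/* a's grace period "ends" */
	enqueue(&blk, &b, say);		/* b must wait for the next one */
	process_callbacks(&blk);	/* invokes a only */
	quiescent_state(&blk);
	process_callbacks(&blk);	/* invokes b */
	return 0;
}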
*/ @@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { }; #ifdef CONFIG_DEBUG_LOCK_ALLOC +#include <linux/kernel_stat.h> + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = rcp->jiffies_stall; - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -static void check_cpu_stall_preempt(void); - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ -#ifdef CONFIG_RCU_TRACE - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -#endif /* #ifdef CONFIG_RCU_TRACE */ -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); - RCU_TRACE(check_cpu_stall_preempt()); -} - -#ifdef CONFIG_TINY_PREEMPT_RCU - -#include <linux/delay.h> - -/* Global control variables for preemptible RCU. */ -struct rcu_preempt_ctrlblk { - struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ - struct rcu_head **nexttail; - /* Tasks blocked in a preemptible RCU */ - /* read-side critical section while an */ - /* preemptible-RCU grace period is in */ - /* progress must wait for a later grace */ - /* period. This pointer points to the */ - /* ->next pointer of the last task that */ - /* must wait for a later grace period, or */ - /* to &->rcb.rcucblist if there is no */ - /* such task. */ - struct list_head blkd_tasks; - /* Tasks blocked in RCU read-side critical */ - /* section. Tasks are placed at the head */ - /* of this list and age towards the tail. */ - struct list_head *gp_tasks; - /* Pointer to the first task blocking the */ - /* current grace period, or NULL if there */ - /* is no such task. */ - struct list_head *exp_tasks; - /* Pointer to first task blocking the */ - /* current expedited grace period, or NULL */ - /* if there is no such task. If there */ - /* is no current expedited grace period, */ - /* then there cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST - struct list_head *boost_tasks; - /* Pointer to first task that needs to be */ - /* priority-boosted, or NULL if no priority */ - /* boosting is needed. If there is no */ - /* current or expedited grace period, there */ - /* can be no such task. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - u8 gpnum; /* Current grace period. */ - u8 gpcpu; /* Last grace period blocked by the CPU. */ - u8 completed; /* Last grace period completed. */ - /* If all three are equal, RCU is idle. */ -#ifdef CONFIG_RCU_BOOST - unsigned long boost_time; /* When to start boosting (jiffies) */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_TRACE - unsigned long n_grace_periods; -#ifdef CONFIG_RCU_BOOST - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. 
*/ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#endif /* #ifdef CONFIG_RCU_TRACE */ -}; - -static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { - .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, - .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), - RCU_TRACE(.rcb.name = "rcu_preempt") -}; - -static int rcu_preempted_readers_exp(void); -static void rcu_report_exp_done(void); - -/* - * Return true if the CPU has not yet responded to the current grace period. - */ -static int rcu_cpu_blocking_cur_gp(void) -{ - return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Check for a running RCU reader. Because there is only one CPU, - * there can be but one running RCU reader at a time. ;-) - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section - * - * Returns zero if there are no running readers. Returns a positive - * number if there is at least one reader within its RCU read-side - * critical section. Returns a negative number if an outermost reader - * is in the midst of exiting from its RCU read-side critical section. - */ -static int rcu_preempt_running_reader(void) -{ - return current->rcu_read_lock_nesting; -} - -/* - * Check for preempted RCU readers blocking any grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_any(void) -{ - return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); -} - -/* - * Check for preempted RCU readers blocking the current grace period. - * If the caller needs a reliable answer, it must disable hard irqs. - */ -static int rcu_preempt_blocked_readers_cgp(void) -{ - return rcu_preempt_ctrlblk.gp_tasks != NULL; -} - -/* - * Return true if another preemptible-RCU grace period is needed. - */ -static int rcu_preempt_needs_another_gp(void) -{ - return *rcu_preempt_ctrlblk.rcb.curtail != NULL; -} - -/* - * Return true if a preemptible-RCU grace period is in progress. - * The caller must disable hardirqs. - */ -static int rcu_preempt_gp_in_progress(void) -{ - return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. - */ -static struct list_head *rcu_next_node_entry(struct task_struct *t) -{ - struct list_head *np; - - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; - return np; -} - -#ifdef CONFIG_RCU_TRACE - -#ifdef CONFIG_RCU_BOOST -static void rcu_initiate_boost_trace(void); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Dump additional statistice for TINY_PREEMPT_RCU. 
- */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ - seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", - rcu_preempt_ctrlblk.rcb.qlen, - rcu_preempt_ctrlblk.n_grace_periods, - rcu_preempt_ctrlblk.gpnum, - rcu_preempt_ctrlblk.gpcpu, - rcu_preempt_ctrlblk.completed, - "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], - "N."[!rcu_preempt_ctrlblk.gp_tasks], - "E."[!rcu_preempt_ctrlblk.exp_tasks]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", - " ", - "B."[!rcu_preempt_ctrlblk.boost_tasks], - rcu_preempt_ctrlblk.n_tasks_boosted, - rcu_preempt_ctrlblk.n_exp_boosts, - rcu_preempt_ctrlblk.n_normal_boosts, - (int)(jiffies & 0xffff), - (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", - " balk", - rcu_preempt_ctrlblk.n_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, - rcu_preempt_ctrlblk.n_balk_boost_tasks, - rcu_preempt_ctrlblk.n_balk_notyet, - rcu_preempt_ctrlblk.n_balk_nos); -#endif /* #ifdef CONFIG_RCU_BOOST */ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO - -/* Controls for rcu_kthread() kthread. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; - -/* - * Carry out RCU priority boosting on the task indicated by ->boost_tasks, - * and advance ->boost_tasks to the next task in the ->blkd_tasks list. - */ -static int rcu_boost(void) -{ - unsigned long flags; - struct rt_mutex mtx; - struct task_struct *t; - struct list_head *tb; - - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - return 0; /* Nothing to boost. */ - - local_irq_save(flags); - - /* - * Recheck with irqs disabled: all tasks in need of boosting - * might exit their RCU read-side critical sections on their own - * if we are preempted just before disabling irqs. - */ - if (rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - local_irq_restore(flags); - return 0; - } - - /* - * Preferentially boost tasks blocking expedited grace periods. - * This cannot starve the normal grace periods because a second - * expedited grace period must boost all blocked tasks, including - * those blocking the pre-existing normal grace period. - */ - if (rcu_preempt_ctrlblk.exp_tasks != NULL) { - tb = rcu_preempt_ctrlblk.exp_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else { - tb = rcu_preempt_ctrlblk.boost_tasks; - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); - } - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - - /* - * We boost task t by manufacturing an rt_mutex that appears to - * be held by task t. We leave a pointer to that rt_mutex where - * task t can find it, and task t will release the mutex when it - * exits its outermost RCU read-side critical section. Then - * simply acquiring this artificial rt_mutex will boost task - * t's priority. (Thanks to tglx for suggesting this approach!) - */ - t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; - local_irq_restore(flags); - rt_mutex_lock(&mtx); - rt_mutex_unlock(&mtx); /* Keep lockdep happy. 
*/ - - return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || - ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; -} - -/* - * Check to see if it is now time to start boosting RCU readers blocking - * the current grace period, and, if so, tell the rcu_kthread_task to - * start boosting them. If there is an expedited boost in progress, - * we wait for it to complete. - * - * If there are no blocked readers blocking the current grace period, - * return 0 to let the caller know, otherwise return 1. Note that this - * return value is independent of whether or not boosting was done. - */ -static int rcu_initiate_boost(void) -{ - if (!rcu_preempt_blocked_readers_cgp() && - rcu_preempt_ctrlblk.exp_tasks == NULL) { - RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); - return 0; - } - if (rcu_preempt_ctrlblk.exp_tasks != NULL || - (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { - if (rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_callbacks(); - } else { - RCU_TRACE(rcu_initiate_boost_trace()); - } - return 1; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(void) -{ - rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * If there is no RCU priority boosting, we don't initiate boosting, - * but we do indicate whether there are blocked readers blocking the - * current grace period. - */ -static int rcu_initiate_boost(void) -{ - return rcu_preempt_blocked_readers_cgp(); -} - -/* - * If there is no RCU priority boosting, nothing to do at grace-period start. - */ -static void rcu_preempt_boost_start_gp(void) -{ -} - -#endif /* else #ifdef CONFIG_RCU_BOOST */ - -/* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. - * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - * - * Because this is a single-CPU implementation, the only way a grace - * period can end is if the CPU is in a quiescent state. The reason is - * that a blocked preemptible-RCU reader can exit its critical section - * only if the CPU is running it at the time. Therefore, when the - * last task blocking the current grace period exits its RCU read-side - * critical section, neither the CPU nor blocked tasks will be stopping - * the current grace period. (In contrast, SMP implementations - * might have CPUs running in RCU read-side critical sections that - * block later grace periods -- but this is not possible given only - * one CPU.) - */ -static void rcu_preempt_cpu_qs(void) -{ - /* Record both CPU and task as having responded to current GP. */ - rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; - - /* If there is no GP then there is nothing more to do. */ - if (!rcu_preempt_gp_in_progress()) - return; - /* - * Check up on boosting. If there are readers blocking the - * current grace period, leave. 
- */ - if (rcu_initiate_boost()) - return; - - /* Advance callbacks. */ - rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; - rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; - - /* If there are no blocked readers, next GP is done instantly. */ - if (!rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - - /* If there are done callbacks, cause them to be invoked. */ - if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_callbacks(); -} - -/* - * Start a new RCU grace period if warranted. Hard irqs must be disabled. - */ -static void rcu_preempt_start_gp(void) -{ - if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { - - /* Official start of GP. */ - rcu_preempt_ctrlblk.gpnum++; - RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); - reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); - - /* Any blocked RCU readers block new GP. */ - if (rcu_preempt_blocked_readers_any()) - rcu_preempt_ctrlblk.gp_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - - /* Set up for RCU priority boosting. */ - rcu_preempt_boost_start_gp(); - - /* If there is no running reader, CPU is done with GP. */ - if (!rcu_preempt_running_reader()) - rcu_preempt_cpu_qs(); - } -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from. If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * If the task started after the current grace period began, as recorded - * by ->gpcpu, we enqueue at the beginning of the list. Otherwise - * before the element referenced by ->gp_tasks (or at the tail if - * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section. Therefore, the current grace period - * cannot be permitted to complete until the ->gp_tasks pointer becomes - * NULL. - * - * Caller must disable preemption. - */ -void rcu_preempt_note_context_switch(void) -{ - struct task_struct *t = current; - unsigned long flags; - - local_irq_save(flags); /* must exclude scheduler_tick(). */ - if (rcu_preempt_running_reader() > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - - /* Possibly blocking in an RCU read-side critical section. */ - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - - /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. - */ - list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); - if (rcu_cpu_blocking_cur_gp()) - rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; - } else if (rcu_preempt_running_reader() < 0 && - t->rcu_read_unlock_special) { - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - } - - /* - * Either we were not in an RCU read-side critical section to - * begin with, or we have now recorded that critical section - * globally. 
Either way, we can now note a quiescent state - * for this CPU. Again, if we were in an RCU read-side critical - * section, and if that critical section was blocking the current - * grace period, then the fact that the task has been enqueued - * means that current grace period continues to be blocked. - */ - rcu_preempt_cpu_qs(); - local_irq_restore(flags); -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. - */ -void rcu_read_unlock_special(struct task_struct *t) -{ - int empty; - int empty_exp; - unsigned long flags; - struct list_head *np; -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - int special; - - /* - * NMI handlers cannot block and cannot safely manipulate state. - * They therefore cannot possibly be special, so just leave. - */ - if (in_nmi()) - return; - - local_irq_save(flags); - - /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. - */ - special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) - rcu_preempt_cpu_qs(); - - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { - local_irq_restore(flags); - return; - } - - /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - - /* - * Remove this task from the ->blkd_tasks list and adjust - * any pointers that might have been referencing it. - */ - empty = !rcu_preempt_blocked_readers_cgp(); - empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = rcu_next_node_entry(t); - list_del_init(&t->rcu_node_entry); - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) - rcu_preempt_ctrlblk.gp_tasks = np; - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) - rcu_preempt_ctrlblk.exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) - rcu_preempt_ctrlblk.boost_tasks = np; -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the current list, and if - * we aren't waiting on the CPU, report the quiescent state - * and start a new grace period if needed. - */ - if (!empty && !rcu_preempt_blocked_readers_cgp()) { - rcu_preempt_cpu_qs(); - rcu_preempt_start_gp(); - } - - /* - * If this was the last task on the expedited lists, - * then we need wake up the waiting task. - */ - if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_report_exp_done(); - } -#ifdef CONFIG_RCU_BOOST - /* Unboost self if was boosted. */ - if (t->rcu_boost_mutex != NULL) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - rt_mutex_unlock(rbmp); - } -#endif /* #ifdef CONFIG_RCU_BOOST */ - local_irq_restore(flags); -} - -/* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the rcu_preempt_ctrlblk structure, which is - * checked elsewhere. This is called from the scheduling-clock interrupt. - * - * Caller must disable hard irqs. 
- */ -static void rcu_preempt_check_callbacks(void) -{ - struct task_struct *t = current; - - if (rcu_preempt_gp_in_progress() && - (!rcu_preempt_running_reader() || - !rcu_cpu_blocking_cur_gp())) - rcu_preempt_cpu_qs(); - if (&rcu_preempt_ctrlblk.rcb.rcucblist != - rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_callbacks(); - if (rcu_preempt_gp_in_progress() && - rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader() > 0) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -/* - * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from rcu_process_callbacks() to - * handle that case. Of course, it is invoked for all flavors of - * RCU, but RCU callbacks can appear only on one of the lists, and - * neither ->nexttail nor ->donetail can possibly be NULL, so there - * is no need for an explicit check. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ - if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) - rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; -} - -/* - * Process callbacks for preemptible RCU. - */ -static void rcu_preempt_process_callbacks(void) -{ - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); -} - -/* - * Queue a preemptible -RCU callback for invocation after a grace period. - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcu_preempt_ctrlblk.nexttail = head; - rcu_preempt_ctrlblk.nexttail = &head->next; - RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); - rcu_preempt_start_gp(); /* checks to see if GP needed. */ - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (!rcu_scheduler_active) - return; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - if (!rcu_preempt_blocked_readers_any()) - return; - - /* Once we get past the fastpath checks, same code as rcu_barrier(). */ - if (rcu_expedited) - synchronize_rcu_expedited(); - else - rcu_barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(void) -{ - return rcu_preempt_ctrlblk.exp_tasks != NULL; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. 
- */ -static void rcu_report_exp_done(void) -{ - wake_up(&sync_rcu_preempt_exp_wq); -} - -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to rely in the fact that there is but one CPU, and that it is - * illegal for a task to invoke synchronize_rcu_expedited() while in a - * preemptible-RCU read-side critical section. Therefore, any such - * critical sections must correspond to blocked tasks, which must therefore - * be on the ->blkd_tasks list. So just record the current head of the - * list in the ->exp_tasks pointer, and wait for all tasks including and - * after the task pointed to by ->exp_tasks to drain. - */ -void synchronize_rcu_expedited(void) -{ - unsigned long flags; - struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; - unsigned long snap; - - barrier(); /* ensure prior action seen before grace period. */ - - WARN_ON_ONCE(rcu_preempt_running_reader()); - - /* - * Acquire lock so that there is only one preemptible RCU grace - * period in flight. Of course, if someone does the expedited - * grace period for us while we are acquiring the lock, just leave. - */ - snap = sync_rcu_preempt_exp_count + 1; - mutex_lock(&sync_rcu_preempt_exp_mutex); - if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) - goto unlock_mb_ret; /* Others did our work for us. */ - - local_irq_save(flags); - - /* - * All RCU readers have to already be on blkd_tasks because - * we cannot legally be executing in an RCU read-side critical - * section. - */ - - /* Snapshot current head of ->blkd_tasks list. */ - rpcp->exp_tasks = rpcp->blkd_tasks.next; - if (rpcp->exp_tasks == &rpcp->blkd_tasks) - rpcp->exp_tasks = NULL; - - /* Wait for tail of ->blkd_tasks list to drain. */ - if (!rcu_preempted_readers_exp()) { - local_irq_restore(flags); - } else { - rcu_initiate_boost(); - local_irq_restore(flags); - wait_event(sync_rcu_preempt_exp_wq, - !rcu_preempted_readers_exp()); - } - - /* Clean up and exit. */ - barrier(); /* ensure expedited GP seen before counter increment. */ - sync_rcu_preempt_exp_count++; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); - barrier(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* - * Does preemptible RCU need the CPU to stay out of dynticks mode? - */ -int rcu_preempt_needs_cpu(void) -{ - return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; -} - -#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_TRACE - -/* - * Because preemptible RCU does not exist, it is not necessary to - * dump out its statistics. - */ -static void show_tiny_preempt_stats(struct seq_file *m) -{ -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(void) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to remove. - */ -static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) -{ -} - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to process. - */ -static void rcu_preempt_process_callbacks(void) -{ -} - -#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. 
- */ -static void invoke_rcu_callbacks(void) -{ - have_rcu_kthread_work = 1; - if (rcu_kthread_task != NULL) - wake_up(&rcu_kthread_wq); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return rcu_kthread_task == current; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) -{ - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) - rcu_process_callbacks(NULL); - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ -} - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* Hold off callback invocation until early_initcall() time. */ -static int rcu_scheduler_fully_active __read_mostly; - -/* - * Start up softirq processing of callbacks. - */ -void invoke_rcu_callbacks(void) -{ - if (rcu_scheduler_fully_active) - raise_softirq(RCU_SOFTIRQ); -} - -#ifdef CONFIG_RCU_TRACE - -/* - * There is no callback kthread, so this thread is never it. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ - -static int __init rcu_scheduler_really_started(void) -{ - rcu_scheduler_fully_active = 1; - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ - return 0; -} -early_initcall(rcu_scheduler_really_started); - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include <linux/kernel_stat.h> /* * During boot, we forgive RCU lockdep issues. 
After this function is @@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void) #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_BOOST - -static void rcu_initiate_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_balk_blkd_tasks++; - else if (rcu_preempt_ctrlblk.gp_tasks == NULL && - rcu_preempt_ctrlblk.exp_tasks == NULL) - rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; - else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_balk_boost_tasks++; - else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_balk_notyet++; - else - rcu_preempt_ctrlblk.n_balk_nos++; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) { unsigned long flags; @@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) */ static int show_tiny_stats(struct seq_file *m, void *unused) { - show_tiny_preempt_stats(m); seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); return 0; @@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); -static void check_cpu_stall_preempt(void) +static void check_cpu_stall(struct rcu_ctrlblk *rcp) { -#ifdef CONFIG_TINY_PREEMPT_RCU - check_cpu_stall(&rcu_preempt_ctrlblk.rcb); -#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); } #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e1f3a8c9672..be63101c617 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -52,72 +52,78 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval = 60; /* Interval between stats, in seconds. */ - /* Zero means "only at end of test". */ -static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz = true; - /* Test RCU support for tickless idle CPUs. */ -static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ -static int stutter = 5; /* Start/stop testing interval (in sec) */ -static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff; /* Hold time within burst (us). */ -static int fqs_stutter = 3; /* Wait time between bursts (s). 
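/*
 * Illustrative sketch (not part of the patch above): the stall-check code
 * consolidated here compares jiffies values with ULONG_CMP_GE() rather
 * than a plain >=, so the check keeps working when the free-running
 * counter wraps.  The macro below is a local stand-in with the same
 * wrap-safe semantics; the demo shows why the naive comparison breaks
 * across a wrap.
 */
#include <limits.h>
#include <stdio.h>

/* Wrap-safe "a is at or after b" for free-running unsigned counters. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long b = ULONG_MAX - 5;	/* just before wrapping */
	unsigned long a = b + 10;		/* 10 ticks later, after the wrap */

	printf("naive a >= b      : %d\n", a >= b);		/* 0: wrong */
	printf("ULONG_CMP_GE(a, b): %d\n", ULONG_CMP_GE(a, b));	/* 1: right */
	return 0;
}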
*/ -static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ -static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ -static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ -static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ -static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ -static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ -static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ -static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ -static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ -static char *torture_type = "rcu"; /* What RCU implementation to torture. */ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int fqs_duration; module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; module_param(n_barrier_cbs, int, 0444); MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; module_param(onoff_holdoff, int, 0444); MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); 
+MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; module_param(stall_cpu, int, 0444); MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; module_param(stall_cpu_holdoff, int, 0444); MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static int test_boost_duration = 4; module_param(test_boost_duration, int, 0444); MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ @@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. 
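/*
 * Illustrative sketch (not part of the patch above): the rcutorture hunk
 * converts each module parameter to the "static declaration, then
 * module_param(), then MODULE_PARM_DESC()" grouping, sorted by name.  A
 * minimal out-of-tree module using the same pattern could look like the
 * following; the module and parameter names are made up for the example.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int demo_interval = 3;
module_param(demo_interval, int, 0444);
MODULE_PARM_DESC(demo_interval, "Seconds between demo events");

static int __init demo_param_init(void)
{
	pr_info("demo_param: demo_interval=%d\n", demo_interval);
	return 0;
}

static void __exit demo_param_exit(void)
{
}

module_init(demo_param_init);
module_exit(demo_param_exit);
MODULE_LICENSE("GPL");
/*
 * Built as an ordinary out-of-tree module, it would then be loaded with
 * something like "insmod demo_param.ko demo_interval=10".
 */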
*/ -static void rcutorture_shutdown_absorb(char *title) +static void rcutorture_shutdown_absorb(const char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_notice( @@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(char *title) +rcu_stutter_wait(const char *title) { while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) @@ -360,13 +366,14 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*exp_sync)(void); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); int (*stats)(char *page); int irq_capable; int can_boost; - char *name; + const char *name; }; static struct rcu_torture_ops *cur_ops; @@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) call_rcu(&p->rtort_rcu, rcu_torture_cb); } -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - static void rcu_sync_torture_init(void) { INIT_LIST_HEAD(&rcu_torture_removed); } -static struct rcu_torture_ops rcu_sync_ops = { +static struct rcu_torture_ops rcu_ops = { .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu_sync" -}; - -static struct rcu_torture_ops rcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_expedited, - .call = NULL, - .cb_barrier = NULL, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .can_boost = rcu_can_boost(), - .name = "rcu_expedited" + .name = "rcu" }; /* @@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, + .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, @@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_sync" -}; - -static struct rcu_torture_ops rcu_bh_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh_expedited, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_expedited" -}; - /* * Definitions for srcu torture testing. */ @@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page) return cnt; } +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + static struct rcu_torture_ops srcu_ops = { .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, @@ -675,83 +602,13 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; -static struct rcu_torture_ops srcu_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_sync" -}; - -static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock_raw(&srcu_ctl); -} - -static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock_raw(&srcu_ctl, idx); -} - -static struct rcu_torture_ops srcu_raw_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_raw" -}; - -static struct rcu_torture_ops srcu_raw_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock_raw, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock_raw, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name 
= "srcu_raw_sync" -}; - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize_expedited, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_expedited" -}; - /* * Definitions for sched torture testing. */ @@ -780,6 +637,8 @@ static struct rcu_torture_ops sched_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -787,35 +646,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .name = "sched_sync" -}; - -static struct rcu_torture_ops sched_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched_expedited, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched_expedited" -}; - /* * RCU torture priority-boost testing. 
Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -965,9 +795,10 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool exp; int i; - long oldbatch = rcu_batches_completed(); struct rcu_torture *rp; + struct rcu_torture *rp1; struct rcu_torture *old_rp; static DEFINE_RCU_RANDOM(rand); @@ -992,10 +823,33 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferred_free(old_rp); + if (gp_normal == gp_exp) + exp = !!(rcu_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } } rcutorture_record_progress(++rcu_torture_current_version); - oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); @@ -1021,10 +875,18 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) + rcu_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - else + } else if (gp_normal == gp_exp) { + if (rcu_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1402,7 +1264,7 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " @@ -1514,7 +1376,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int __cpuinit +static int rcu_torture_onoff(void *arg) { int cpu; @@ -1572,7 +1434,13 @@ rcu_torture_onoff(void *arg) torture_type, cpu); starttime = jiffies; n_online_attempts++; - if (cpu_up(cpu) == 0) { + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) pr_alert("%s" TORTURE_FLAG "rcu_torture_onoff task: onlined %d\n", @@ -1596,7 +1464,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int __cpuinit +static int rcu_torture_onoff_init(void) { int ret; @@ -1639,7 +1507,7 @@ static void rcu_torture_onoff_cleanup(void) * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. 
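 * The stall itself is essentially a busy-wait with preemption disabled.
 * A minimal sketch of such a loop (illustrative only; the actual helper
 * may differ in detail):
 *
 *	stop_at = jiffies + stall_cpu * HZ;
 *	preempt_disable();
 *	while (ULONG_CMP_LT(jiffies, stop_at))
 *		continue;
 *	preempt_enable();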
*/ -static int __cpuinit rcu_torture_stall(void *args) +static int rcu_torture_stall(void *args) { unsigned long stop_at; @@ -1972,6 +1840,62 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. + */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. 
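+ * rcu_barrier() waits until all callbacks queued above have actually
+ * been invoked, not merely until a grace period has elapsed, so the
+ * on-stack rcu_head structures cannot be referenced after we return.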
*/ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + static int __init rcu_torture_init(void) { @@ -1979,12 +1903,9 @@ rcu_torture_init(void) int cpu; int firsterr = 0; int retval; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &srcu_raw_ops, &srcu_raw_sync_ops, - &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; mutex_lock(&fullstop_mutex); @@ -2202,6 +2123,8 @@ rcu_torture_init(void) firsterr = retval; goto unwind; } + if (object_debug) + rcu_test_debug_objects(); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 16ea6792501..32618b3fe4e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -53,18 +53,38 @@ #include <linux/delay.h> #include <linux/stop_machine.h> #include <linux/random.h> +#include <linux/ftrace_event.h> +#include <linux/suspend.h> #include "rcutree.h" #include <trace/events/rcu.h> #include "rcu.h" +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp_<sname>_varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. 
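+ * For example, RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched)
+ * below emits, among other things:
+ *
+ *	static char rcu_sched_varname[] = "rcu_sched";
+ *	static const char *tp_rcu_sched_varname __used __tracepoint_string =
+ *		rcu_sched_varname;
+ *
+ * and then uses rcu_sched_varname as the ->name of rcu_sched_state.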
+ */ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +static char sname##_varname[] = #sname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ +struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = #sname, \ + .name = sname##_varname, \ .abbr = sabbr, \ -} - -struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); -DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); +}; \ +DEFINE_PER_CPU(struct rcu_data, sname##_data) -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); +RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *rcu_state; LIST_HEAD(rcu_struct_flavors); @@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu) */ void rcu_note_context_switch(int cpu) { - trace_rcu_utilization("Start context switch"); + trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); - trace_rcu_utilization("End context switch"); + trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. 
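 * This bounds how many callbacks one pass of rcu_do_batch() invokes, so
 * a long callback list cannot monopolize the CPU. Like qhimark and
 * qlowmark below, it can be adjusted at boot time (rcutree.blimit= and
 * friends).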
*/ @@ -218,15 +239,18 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; -static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -411,6 +435,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -428,27 +453,6 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } - -/** - * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace - * after the current irq returns. - * - * This is similar to rcu_user_enter() but in the context of a non-nesting - * irq. After this call, RCU enters into idle mode when the interrupt - * returns. - */ -void rcu_user_enter_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure this irq is interrupting a non-idle RCU state. */ - WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); - rdtp->dynticks_nesting = 1; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -479,9 +483,10 @@ void rcu_irq_exit(void) rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); if (rdtp->dynticks_nesting) - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); + rcu_sysidle_enter(rdtp, 1); local_irq_restore(flags); } @@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, smp_mb__after_atomic_inc(); /* See above. 
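 * At this point the atomic_inc() above has moved ->dynticks from an
 * even (idle) value to an odd (non-idle) value, which is exactly what
 * the WARN_ON_ONCE() below verifies.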
*/ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on exit: not idle task", + trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -550,6 +555,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -565,28 +571,6 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } - -/** - * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace - * idle mode after the current non-nesting irq returns. - * - * This is similar to rcu_user_exit() but in the context of an irq. - * This is called when the irq has interrupted a userspace RCU idle mode - * context. When the current non-nesting interrupt returns after this call, - * the CPU won't restore the RCU idle mode. - */ -void rcu_user_exit_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure we are interrupting an RCU idle mode. */ - WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); - rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -620,9 +604,10 @@ void rcu_irq_enter(void) rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); if (oldval) - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); + rcu_sysidle_exit(rdtp, 1); local_irq_restore(flags); } @@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int dyntick_save_progress_counter(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { unsigned int curr; unsigned int snap; @@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; } @@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; /* Grace period is not old enough. 
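 * Report no quiescent state yet; only once the grace period has been
 * pending for longer do we go on to the heavier checks below, starting
 * with whether the CPU is simply offline.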
*/ barrier(); if (cpu_is_offline(rdp->cpu)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; } @@ -866,7 +854,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", + pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { @@ -899,7 +887,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) smp_processor_id(), (long)(jiffies - rsp->gp_start), rsp->gpnum, rsp->completed, totqlen); if (ndetected == 0) - printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) rcu_dump_cpu_stacks(rsp); @@ -922,7 +910,7 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rsp->name); print_cpu_stall_info_begin(); print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); @@ -985,65 +973,6 @@ void rcu_cpu_stall_reset(void) } /* - * Update CPU-local rcu_data state to record the newly noticed grace period. - * This is used both when we started the grace period and when we notice - * that someone else started the grace period. The caller must hold the - * ->lock of the leaf rcu_node structure corresponding to the current CPU, - * and must have irqs disabled. - */ -static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. - */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); - zero_cpu_stall_ticks(rdp); - } -} - -static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_new_gpnum(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Did someone else start a new RCU grace period start since we last - * checked? Update local state appropriately if so. Must be called - * on the CPU corresponding to rdp. - */ -static int -check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (rdp->gpnum != rsp->gpnum) { - note_new_gpnum(rsp, rdp); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - -/* * Initialize the specified rcu_data structure's callback list to empty. */ static void init_callback_list(struct rcu_data *rdp) @@ -1091,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, * rcu_nocb_wait_gp(). 
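 * The strings passed in here are compile-time literals wrapped in TPS()
 * and so live in the __tracepoint_string section, hence the const char *
 * parameter type.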
*/ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, char *s) + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1117,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * grace period is already marked as needed, return to the caller. */ c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); return c; } @@ -1133,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) if (rnp->gpnum != rnp->completed || ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); return c; } @@ -1161,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * recorded, trace and leave. */ if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } @@ -1170,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { - trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: @@ -1196,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + trace_rcu_future_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1264,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); } /* @@ -1313,18 +1243,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. In addition, the corresponding leaf rcu_node structure's - * ->lock must be held by the caller, with irqs disabled. + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods. The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled. */ -static void -__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - /* Did another grace period end? */ + /* Handle the ends of any preceding grace periods first. 
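+ * This one helper now records both grace-period ends (->completed)
+ * and grace-period beginnings (->gpnum), folding together what
+ * __rcu_process_gp_end() and __note_new_gpnum() used to do separately.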
*/ if (rdp->completed == rnp->completed) { - /* No, so just accelerate recent callbacks. */ + /* No grace period end, so just accelerate recent callbacks. */ rcu_accelerate_cbs(rsp, rnp, rdp); } else { @@ -1334,69 +1262,41 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); - - /* - * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs handled on - * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. If another grace period - * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. Of course, any quiescent - * states we found for the old GP are now invalid. - */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { - rdp->gpnum = rdp->completed; - rdp->passed_quiesce = 0; - } + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + } + if (rdp->gpnum != rnp->gpnum) { /* - * If RCU does not need a quiescent state from this CPU, - * then make sure that this CPU doesn't go looking for one. + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. */ - if ((rnp->qsmask & rdp->grpmask) == 0) - rdp->qs_pending = 0; + rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + zero_cpu_stall_ticks(rdp); } } -/* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_node *rnp; local_irq_save(flags); rnp = rdp->mynode; - if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && + rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* - * Do per-CPU grace-period initialization for running CPU. The caller - * must hold the lock of the leaf rcu_node structure corresponding to - * this CPU. - */ -static void -rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Prior grace period ended, so advance callbacks for current CPU. */ - __rcu_process_gp_end(rsp, rnp, rdp); - - /* Set state so that this CPU will detect the next quiescent state. */ - __note_new_gpnum(rsp, rnp, rdp); -} - -/* * Initialize a new grace period. */ static int rcu_gp_init(struct rcu_state *rsp) @@ -1404,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); rsp->gp_flags = 0; /* Clear all flags: New grace period. */ @@ -1415,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. 
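 * Once rsp->gpnum is incremented below, ->gpnum differs from
 * ->completed, which is how the rest of RCU recognizes that a grace
 * period is in progress.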
*/ rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); @@ -1444,16 +1345,16 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && system_state == SYSTEM_RUNNING) - schedule_timeout_uninterruptible(2); + udelay(200); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } @@ -1468,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp) int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; + bool isidle = false; + unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } + force_qs_rnp(rsp, dyntick_save_progress_counter, + &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + isidle = 0; + force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -1527,7 +1437,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __rcu_process_gp_end(rsp, rnp, rdp); + __note_gp_changes(rsp, rnp, rdp); nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); @@ -1537,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ @@ -1613,6 +1523,14 @@ static int __noreturn rcu_gp_kthread(void *arg) } } +static void rsp_wakeup(struct irq_work *work) +{ + struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1637,8 +1555,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, } rsp->gp_flags = RCU_GP_FLAG_INIT; - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + /* + * We can't do wakeups while holding the rnp->lock, as that + * could cause possible deadlocks with the rq->lock. Defer + * the wakeup to interrupt context. And don't bother waking + * up the running kthread. 
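+ * irq_work_queue() arranges for rsp_wakeup() above (hooked up via
+ * init_irq_work() in rcu_init_one()) to run from hard-interrupt context
+ * once this lock has been dropped; the wake_up() of rsp->gp_wq happens
+ * there, safely outside rnp->lock.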
+ */ + if (current != rsp->gp_kthread) + irq_work_queue(&rsp->wakeup_work); } /* @@ -1793,9 +1717,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) { - /* If there is now a new grace period, record and return. */ - if (check_for_new_grace_period(rsp, rdp)) - return; + /* Check for grace-period ends and beginnings. */ + note_gp_changes(rsp, rdp); /* * Does this CPU still need to do its part for current grace period? @@ -1935,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), - "cpuofl"); + TPS("cpuofl")); } /* @@ -2122,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - trace_rcu_utilization("Start scheduler-tick"); + trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -2155,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); - trace_rcu_utilization("End scheduler-tick"); + trace_rcu_utilization(TPS("End scheduler-tick")); } /* @@ -2165,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj) { unsigned long bit; int cpu; @@ -2188,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu))) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { @@ -2259,9 +2188,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* Handle the end of a grace period that some other CPU ended. */ - rcu_process_gp_end(rsp, rdp); - /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -2289,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) if (cpu_is_offline(smp_processor_id())) return; - trace_rcu_utilization("Start RCU core"); + trace_rcu_utilization(TPS("Start RCU core")); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); - trace_rcu_utilization("End RCU core"); + trace_rcu_utilization(TPS("End RCU core")); } /* @@ -2346,8 +2272,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); + note_gp_changes(rsp, rdp); /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { @@ -2369,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, } /* + * RCU callback function to leak a callback. + */ +static void rcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* * Helper function for call_rcu() and friends. 
The cpu argument will * normally be -1, indicating "currently running CPU". It may specify * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() @@ -2382,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_data *rdp; WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + /* Probable double call_rcu(), so leak the callback. */ + ACCESS_ONCE(head->func) = rcu_leak_callback; + WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); + return; + } head->func = func; head->next = NULL; @@ -2802,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) * Helper function for _rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ -static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, +static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, int cpu, unsigned long done) { trace_rcu_barrier(rsp->name, s, cpu, @@ -2867,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp) * transition. The "if" expression below therefore rounds the old * value up to the next even number and adds two before comparing. */ - snap_done = ACCESS_ONCE(rsp->n_barrier_done); + snap_done = rsp->n_barrier_done; _rcu_barrier_trace(rsp, "Check", -1, snap_done); - if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + + /* + * If the value in snap is odd, we needed to wait for the current + * rcu_barrier() to complete, then wait for the next one, in other + * words, we need the value of snap_done to be three larger than + * the value of snap. On the other hand, if the value in snap is + * even, we only had to wait for the next rcu_barrier() to complete, + * in other words, we need the value of snap_done to be only two + * greater than the value of snap. The "(snap + 3) & ~0x1" computes + * this for us (thank you, Linus!). + */ + if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); @@ -2992,7 +2940,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * can accept some slop in the rsp->completed access due to the fact * that this CPU cannot possibly have any RCU callbacks in flight yet. */ -static void __cpuinit +static void rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; @@ -3012,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->blimit = blimit; init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -3034,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. 
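 * Interrupts stay off for the entire walk up the rcu_node tree, so only
 * the per-node spinlock itself needs to be dropped before moving on to
 * the parent below.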
*/ rnp = rnp->parent; @@ -3044,7 +2993,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) mutex_unlock(&rsp->onoff_mutex); } -static void __cpuinit rcu_prepare_cpu(int cpu) +static void rcu_prepare_cpu(int cpu) { struct rcu_state *rsp; @@ -3056,7 +3005,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3064,7 +3013,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization("Start CPU hotplug"); + trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3093,7 +3042,26 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization("End CPU hotplug"); + trace_rcu_utilization(TPS("End CPU hotplug")); + return NOTIFY_OK; +} + +static int rcu_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_expedited = 1; + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + rcu_expedited = 0; + break; + default: + break; + } return NOTIFY_OK; } @@ -3108,7 +3076,7 @@ static int __init rcu_spawn_gp_kthread(void) struct task_struct *t; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -3235,6 +3203,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); + init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -3252,11 +3221,25 @@ static void __init rcu_init_one(struct rcu_state *rsp, */ static void __init rcu_init_geometry(void) { + ulong d; int i; int j; int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; + /* + * Initialize any unspecified boot parameters. + * The default values of jiffies_till_first_fqs and + * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS + * value, which is a function of HZ, then adding one for each + * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. + */ + d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + if (jiffies_till_first_fqs == ULONG_MAX) + jiffies_till_first_fqs = d; + if (jiffies_till_next_fqs == ULONG_MAX) + jiffies_till_next_fqs = d; + /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) @@ -3323,6 +3306,7 @@ void __init rcu_init(void) * or the scheduler are operational. 
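 * The PM notifier registered below (rcu_pm_notify()) makes suspend and
 * hibernation temporarily switch to expedited grace periods on systems
 * with at most 256 CPUs, shortening the grace-period waits in those
 * paths; larger systems skip this because expediting is too disruptive
 * there.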
*/ cpu_notifier(rcu_cpu_notify, 0); + pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index da77a8f57ff..5f97eab602c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/irq_work.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -87,6 +88,14 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + long long dynticks_idle_nesting; + /* irq/process nesting level from idle. */ + atomic_t dynticks_idle; /* Even value for idle, else odd. */ + /* "Idle" excludes userspace execution. */ + unsigned long dynticks_idle_jiffies; + /* End of last non-NMI non-idle period. */ +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -342,12 +351,17 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK -#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) + /* For jiffies_till_first_fqs and */ + /* and jiffies_till_next_fqs. */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ +#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ + /* delay between bouts of */ + /* quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ + /* at least one scheduling clock */ + /* irq before ratting on them. */ #define rcu_wait(cond) \ do { \ @@ -439,9 +453,10 @@ struct rcu_state { /* for CPU stalls. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ - char *name; /* Name of structure. */ + const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. 
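 * RCU_GP_FLAG_INIT asks the grace-period kthread to start a new grace
 * period and RCU_GP_FLAG_FQS asks it to force quiescent states; both
 * are set while holding the root rcu_node structure's ->lock and are
 * acted on by rcu_gp_kthread().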
*/ @@ -514,10 +529,10 @@ static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST static void rcu_preempt_do_callbacks(void); -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); @@ -538,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); +static void rcu_bind_gp_kthread(void); +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3db5a375d8d..130c97b027f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> -#include <linux/tick.h> +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5]; static void __init rcu_bootup_announce_oddness(void) { #ifdef CONFIG_RCU_TRACE - printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU debugfs-based tracing is enabled.\n"); #endif #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", CONFIG_RCU_FANOUT); #endif #ifdef CONFIG_RCU_FANOUT_EXACT - printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); #endif #ifdef CONFIG_RCU_FAST_NO_HZ - printk(KERN_INFO - "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); #endif #ifdef CONFIG_PROVE_RCU - printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); + pr_info("\tRCU lockdep checking is enabled.\n"); #endif #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); + pr_info("\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); + pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); #endif #if defined(CONFIG_RCU_CPU_STALL_INFO) - printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); + pr_info("\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); + pr_info("\tFour-level hierarchy is enabled.\n"); #endif if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - 
printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { @@ -92,28 +91,26 @@ static void __init rcu_bootup_announce_oddness(void) have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tExperimental no-CBs CPU 0\n"); + pr_info("\tOffload RCU callbacks from CPU 0\n"); cpumask_set_cpu(0, rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tExperimental no-CBs for all CPUs\n"); + pr_info("\tOffload RCU callbacks from all CPUs\n"); cpumask_setall(rcu_nocb_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) - pr_info("\tExperimental polled no-CBs CPUs.\n"); + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); } #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -123,7 +120,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); + pr_info("Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -170,7 +167,7 @@ static void rcu_preempt_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -389,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; - trace_rcu_unlock_preempted_task("rcu_preempt", + trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; @@ -413,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { - trace_rcu_quiescent_state_report("preempt_rcu", + trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, rnp->level, @@ -490,13 +487,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) static void rcu_print_task_stall_begin(struct rcu_node *rnp) { - printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); } static void rcu_print_task_stall_end(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } #else /* #ifdef 
CONFIG_RCU_CPU_STALL_INFO */ @@ -526,7 +523,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(KERN_CONT " P%d", t->pid); + pr_cont(" P%d", t->pid); ndetected++; } rcu_print_task_stall_end(); @@ -933,6 +930,24 @@ static void __init __rcu_init_preempt(void) rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ static struct rcu_state *rcu_state = &rcu_sched_state; @@ -942,7 +957,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state; */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Hierarchical RCU implementation.\n"); + pr_info("Hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -1101,6 +1116,14 @@ static void __init __rcu_init_preempt(void) { } +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} + #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ #ifdef CONFIG_RCU_BOOST @@ -1225,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; - trace_rcu_utilization("Start boost kthread@init"); + trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); - trace_rcu_utilization("Start boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1239,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) spincnt = 0; if (spincnt > 10) { rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("Start boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } } /* NOTREACHED */ - trace_rcu_utilization("End boost kthread@notreached"); + trace_rcu_utilization(TPS("End boost kthread@notreached")); return 0; } @@ -1327,7 +1350,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. 
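 * The __cpuinit annotation is dropped here, as it is throughout this
 * patch, as part of the kernel-wide retirement of the __cpuinit
 * infrastructure.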
*/ -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; @@ -1394,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization("Start CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; this_cpu_inc(rcu_cpu_kthread_loops); @@ -1406,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) rcu_kthread_do_work(); local_bh_enable(); if (*workp == 0) { - trace_rcu_utilization("End CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; } } *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("Start CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("End CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -1482,7 +1505,7 @@ static int __init rcu_spawn_kthreads(void) } early_initcall(rcu_spawn_kthreads); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; @@ -1524,7 +1547,7 @@ static int __init rcu_scheduler_really_started(void) } early_initcall(rcu_scheduler_really_started); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu) { } @@ -1629,7 +1652,7 @@ static bool rcu_try_advance_all_cbs(void) */ if (rdp->completed != rnp->completed && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) - rcu_process_gp_end(rsp, rdp); + note_gp_changes(rsp, rdp); if (cpu_has_callbacks_ready_to_invoke(rdp)) cbs_ready = true; @@ -1883,7 +1906,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) /* Initiate the stall-info list. */ static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT "\n"); + pr_cont("\n"); } /* @@ -1914,7 +1937,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, @@ -1925,7 +1948,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) /* Terminate the stall-info list. */ static void print_cpu_stall_info_end(void) { - printk(KERN_ERR "\t"); + pr_err("\t"); } /* Zero ->ticks_this_gp for all flavors of RCU. */ @@ -1948,17 +1971,17 @@ static void increment_cpu_stall_ticks(void) static void print_cpu_stall_info_begin(void) { - printk(KERN_CONT " {"); + pr_cont(" {"); } static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) { - printk(KERN_CONT " %d", cpu); + pr_cont(" %d", cpu); } static void print_cpu_stall_info_end(void) { - printk(KERN_CONT "} "); + pr_cont("} "); } static void zero_cpu_stall_ticks(struct rcu_data *rdp) @@ -2177,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. 
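 * (Tasks sleeping uninterruptibly count toward the load average, so the
 * interruptible wait plus the flush_signals() below keeps this
 * kthread's long idle periods from inflating that number.)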
*/ - trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2185,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); - trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2350,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu) smp_send_reschedule(cpu); #endif /* #ifdef CONFIG_NO_HZ_FULL */ } + + +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + +/* + * Define RCU flavor that holds sysidle state. This needs to be the + * most active flavor of RCU. + */ +#ifdef CONFIG_PREEMPT_RCU +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; +#else /* #ifdef CONFIG_PREEMPT_RCU */ +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +static int full_sysidle_state; /* Current system-idle state. */ +#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ +#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ +#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ +#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ +#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ + +/* + * Invoked to note exit from irq or task transition to idle. Note that + * usermode execution does -not- count as idle here! After all, we want + * to detect full-system idle states, not RCU quiescent states and grace + * periods. The caller must have disabled interrupts. + */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ + unsigned long j; + + /* Adjust nesting, check for fully idle. */ + if (irq) { + rdtp->dynticks_idle_nesting--; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + if (rdtp->dynticks_idle_nesting != 0) + return; /* Still not fully idle. */ + } else { + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) { + rdtp->dynticks_idle_nesting = 0; + } else { + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + return; /* Still not fully idle. */ + } + } + + /* Record start of fully idle period. */ + j = jiffies; + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); +} + +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. 
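+ * cmpxchg() returns the value that was actually found in
+ * full_sysidle_state; if it matches oldstate, the transition to
+ * RCU_SYSIDLE_NOT succeeded, and if the state had reached
+ * RCU_SYSIDLE_FULL_NOTED, the timekeeping CPU is additionally kicked
+ * so that it notices that the system is no longer fully idle.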
+ */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ +} + +/* + * Invoked to note entry to irq or task transition from idle. Note that + * usermode execution does -not- count as idle here! The caller must + * have disabled interrupts. + */ +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ + /* Adjust nesting, check for already non-idle. */ + if (irq) { + rdtp->dynticks_idle_nesting++; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + if (rdtp->dynticks_idle_nesting != 1) + return; /* Already non-idle. */ + } else { + /* + * Allow for irq misnesting. Yes, it really is possible + * to enter an irq handler then never leave it, and maybe + * also vice versa. Handle both possibilities. + */ + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + return; /* Already non-idle. */ + } else { + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; + } + } + + /* Record end of idle period. */ + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + if (rcu_gp_in_progress(rdp->rsp)) + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? 
+ */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. 
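rcu_sysidle_delay() above scales the idle-confirmation delay with machine size. A quick worked example of that formula under assumed values (256 CPUs, HZ=1000, leaf fanout of 16; none of these numbers come from the patch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long nr_cpu_ids = 256, hz = 1000, rcu_fanout_leaf = 16;

        /* Same shape as rcu_sysidle_delay(): bigger machines wait
         * proportionally longer before declaring the system fully idle. */
        unsigned long delay = DIV_ROUND_UP(nr_cpu_ids * hz,
                                           rcu_fanout_leaf * 1000);

        printf("%lu jiffies\n", delay); /* prints 16 */
        return 0;
}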
*/ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). */ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; +} + +/* + * Initialize dynticks sysidle state for CPUs coming online. 
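rcu_sys_is_idle() above guards its call_rcu() with an xchg() on rsh.inuse so that at most one callback is ever in flight; the callback clears the flag again when it runs. A user-space sketch of that claim/release idiom, with GCC builtins standing in for xchg() and the ACCESS_ONCE() store; the names are illustrative:

#include <stdio.h>

static int inuse;

/* Returns 1 if we won the right to submit the callback, 0 if someone else
 * already did and it has not completed yet. */
static int try_claim(void)
{
        return __sync_lock_test_and_set(&inuse, 1) == 0;
}

/* What the completion callback does: release the slot for the next user. */
static void callback_done(void)
{
        __sync_lock_release(&inuse);
}

int main(void)
{
        printf("first claim:  %d\n", try_claim());       /* 1: we got it      */
        printf("second claim: %d\n", try_claim());       /* 0: already in use */
        callback_done();
        printf("third claim:  %d\n", try_claim());       /* 1 again           */
        return 0;
}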
+ */ +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; +} + +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_bind_gp_kthread(void) +{ +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 00000000000..f813b347464 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,426 @@ +/* + * linux/kernel/reboot.c + * + * Copyright (C) 2013 Linus Torvalds + */ + +#define pr_fmt(fmt) "reboot: " fmt + +#include <linux/ctype.h> +#include <linux/export.h> +#include <linux/kexec.h> +#include <linux/kmod.h> +#include <linux/kmsg_dump.h> +#include <linux/reboot.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/syscore_ops.h> +#include <linux/uaccess.h> + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#define DEFAULT_REBOOT_MODE = REBOOT_HARD +#else +#define DEFAULT_REBOOT_MODE +#endif +enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; + +/* + * This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line). This is needed so that we can + * suppress DMI scanning for reboot quirks. Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +int reboot_default = 1; +int reboot_cpu; +enum reboot_type reboot_type = BOOT_ACPI; +int reboot_force; + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); +} + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. 
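The DEFAULT_REBOOT_MODE trick in the new kernel/reboot.c above hides an optional initializer inside a macro: on ARM/unicore32 it expands to "= REBOOT_HARD", elsewhere to nothing, so the same declaration line works for both. A standalone sketch of that idiom; the enum values listed here are illustrative and not necessarily the kernel's exact ordering:

#include <stdio.h>

enum reboot_mode { REBOOT_COLD, REBOOT_WARM, REBOOT_HARD, REBOOT_SOFT, REBOOT_GPIO };

/* Flip this to see both expansions: when set, the macro carries the whole
 * "= REBOOT_HARD" initializer; otherwise it expands to nothing and the
 * variable is zero-initialized (REBOOT_COLD here). */
#define WANT_HARD_DEFAULT 1

#if WANT_HARD_DEFAULT
#define DEFAULT_REBOOT_MODE = REBOOT_HARD
#else
#define DEFAULT_REBOOT_MODE
#endif

enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;

int main(void)
{
        printf("reboot_mode = %d\n", (int)reboot_mode);
        return 0;
}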
+ */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = reboot_cpu; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + migrate_to_reboot_cpu(); + syscore_shutdown(); + if (!cmd) + pr_emerg("Restarting system\n"); + else + pr_emerg("Restarting system with command '%s'\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + migrate_to_reboot_cpu(); + syscore_shutdown(); + pr_emerg("System halted\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + migrate_to_reboot_cpu(); + syscore_shutdown(); + pr_emerg("Power down\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. 
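A hedged, module-style sketch (not part of the patch) of how a driver typically consumes the register_reboot_notifier() / unregister_reboot_notifier() pair shown above; my_reboot_cb and the other names are made up:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long code, void *cmd)
{
        /* code is SYS_RESTART, SYS_HALT or SYS_POWER_OFF; cmd may carry the
         * RESTART2 command string, or be NULL. */
        pr_info("quiescing hardware before reboot (code %lu)\n", code);
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_cb,
};

static int __init my_init(void)
{
        return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_exit(void)
{
        unregister_reboot_notifier(&my_reboot_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");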
*/ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(pid_ns, cmd); + if (ret) + return ret; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); + if (ret < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ + char **argv; + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret; + + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { + ret = -ENOMEM; + } + + if (ret && force) { + pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
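The syscall above insists on the LINUX_REBOOT_MAGIC* values before doing anything destructive. A small user-space sketch that exercises that check through the raw syscall; LINUX_REBOOT_CMD_CAD_OFF only toggles ctrl-alt-del handling, so nothing actually reboots:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
        /* Wrong magic1: fails with -EPERM if we lack CAP_SYS_BOOT (that
         * check runs first), or -EINVAL from the magic check otherwise. */
        if (syscall(SYS_reboot, 0, LINUX_REBOOT_MAGIC2,
                    LINUX_REBOOT_CMD_CAD_OFF, NULL) < 0)
                printf("bad magic: %s\n", strerror(errno));

        /* Correct magic: still needs CAP_SYS_BOOT, otherwise -EPERM. */
        if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                    LINUX_REBOOT_CMD_CAD_OFF, NULL) < 0)
                printf("reboot(CAD_OFF): %s\n", strerror(errno));
        else
                printf("ctrl-alt-del trapping disabled\n");
        return 0;
}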
+ */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); + +static int __init reboot_setup(char *str) +{ + for (;;) { + /* + * Having anything passed on the command line via + * reboot= will cause us to disable DMI checking + * below. + */ + reboot_default = 0; + + switch (*str) { + case 'w': + reboot_mode = REBOOT_WARM; + break; + + case 'c': + reboot_mode = REBOOT_COLD; + break; + + case 'h': + reboot_mode = REBOOT_HARD; + break; + + case 's': + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else + reboot_mode = REBOOT_SOFT; + break; + + case 'g': + reboot_mode = REBOOT_GPIO; + break; + + case 'b': + case 'a': + case 'k': + case 't': + case 'e': + case 'p': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; + } + + str = strchr(str, ','); + if (str) + str++; + else + break; + } + return 1; +} +__setup("reboot=", reboot_setup); diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5..5001c9887db 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan, * * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, +static int relay_hotcpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e704..4aa8a305aed 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -17,8 +17,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); - counter->limit = RESOURCE_MAX; - counter->soft_limit = RESOURCE_MAX; + counter->limit = RES_COUNTER_MAX; + counter->soft_limit = RES_COUNTER_MAX; counter->parent = parent; } @@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) #endif int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res) + unsigned long long *resp) { char *end; + unsigned long long res; - /* return RESOURCE_MAX(unlimited) if "-1" is specified */ + /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - *res = simple_strtoull(buf + 1, &end, 10); - if (*res != 1 || *end != '\0') + res = simple_strtoull(buf + 1, &end, 10); + if (res != 1 || *end != '\0') return -EINVAL; - *res = RESOURCE_MAX; + *resp = RES_COUNTER_MAX; return 0; } - *res = memparse(buf, &end); + res = memparse(buf, &end); if (*end != '\0') return -EINVAL; - *res = PAGE_ALIGN(*res); + if (PAGE_ALIGN(res) >= res) + res = PAGE_ALIGN(res); + else + res = RES_COUNTER_MAX; + + *resp = res; + return 0; } diff --git a/kernel/resource.c b/kernel/resource.c index d7386986e10..3f285dce934 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn) { return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +EXPORT_SYMBOL_GPL(page_is_ram); void __weak arch_remove_reservations(struct resource *avail) { @@ -448,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old, struct resource *this = root->child; struct resource tmp = *new, avail, alloc; - tmp.flags = new->flags; tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the 
assignment diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 1e09308bf2a..0dd6aec1cb6 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -145,6 +145,19 @@ int max_lock_depth = 1024; /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + * needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter + * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index deaf90e4a1d..54adcf35f49 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9..4a073539c58 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; - sched_online_group(tg, &root_task_group); - kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272f..5ac63c9a995 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void) #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock. 
*/ static void hrtick_clear(struct rq *rq) @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) } #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = hrtimer_get_softexpires(timer); + + return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} + /* * called from hardirq (IPI) context */ @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg) struct rq *rq = arg; raw_spin_lock(&rq->lock); - hrtimer_restart(&rq->hrtick_timer); + __hrtick_restart(rq); rq->hrtick_csd_pending = 0; raw_spin_unlock(&rq->lock); } @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_set_expires(timer, time); if (rq == this_rq()) { - hrtimer_restart(timer); + __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; @@ -633,7 +635,19 @@ void wake_up_nohz_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } #else /* CONFIG_NO_HZ_COMMON */ @@ -667,7 +681,7 @@ void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); - while ((s64)(rq->clock - rq->age_stamp) > period) { + while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { /* * Inline assembly required to prevent the compiler * optimising this loop into a divmod call. @@ -919,6 +933,8 @@ static int effective_prio(struct task_struct *p) /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. + * + * Return: 1 if the task is currently executing. 0 otherwise. 
*/ inline int task_curr(const struct task_struct *p) { @@ -962,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq->skip_clock_update = 1; } -static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); - -void register_task_migration_notifier(struct notifier_block *n) -{ - atomic_notifier_chain_register(&task_migration_notifier, n); -} - #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -999,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { - struct task_migration_notifier tmn; - if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); - - tmn.task = p; - tmn.from_cpu = task_cpu(p); - tmn.to_cpu = new_cpu; - - atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -1328,7 +1329,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) p->sched_class->task_woken(rq, p); if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; + u64 delta = rq_clock(rq) - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; if (delta > max) @@ -1365,6 +1366,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p); if (p->on_rq) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); ret = 1; } @@ -1393,8 +1396,9 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() - && !tick_nohz_full_cpu(smp_processor_id())) + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1417,7 +1421,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } @@ -1465,7 +1469,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * Returns %true if @p was woken up, %false if it was already running + * Return: %true if @p was woken up, %false if it was already running. * or @state didn't match @p's state. */ static int @@ -1474,7 +1478,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; - smp_wmb(); + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; @@ -1560,8 +1570,9 @@ out: * @p: The process to be woken up. * * Attempt to wake up the nominated process and move it to the set of runnable - * processes. Returns 1 if the process was woken up, 0 if it was already - * running. + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. * * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. 
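The smp_mb__before_spinlock() comment added to try_to_wake_up() above refers to the canonical sleep/wakeup idiom. A schematic, non-compilable kernel-style fragment of the two sides being ordered; 'condition' and 'waiter_task' are placeholders, not names from the patch:

/* Waiter: set_current_state() implies a full barrier between publishing
 * TASK_INTERRUPTIBLE and re-reading 'condition'. */
for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (condition)
                break;
        schedule();
}
__set_current_state(TASK_RUNNING);

/* Waker: the store to 'condition' must not be reordered past the read of
 * the waiter's ->state inside try_to_wake_up(); the new
 * smp_mb__before_spinlock() there is what enforces that ordering. */
condition = 1;
wake_up_process(waiter_task);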
@@ -1596,15 +1607,6 @@ static void __sched_fork(struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) - p->se.avg.runnable_avg_period = 0; - p->se.avg.runnable_avg_sum = 0; -#endif #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -1748,6 +1750,8 @@ void wake_up_new_task(struct task_struct *p) set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif + /* Initialize new task's runnable average */ + init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; @@ -2056,575 +2060,6 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - -/* - * Global load-average calculations - * - * We take a distributed and async approach to calculating the global load-avg - * in order to minimize overhead. - * - * The global load average is an exponentially decaying average of nr_running + - * nr_uninterruptible. - * - * Once every LOAD_FREQ: - * - * nr_active = 0; - * for_each_possible_cpu(cpu) - * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; - * - * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) - * - * Due to a number of reasons the above turns in the mess below: - * - * - for_each_possible_cpu() is prohibitively expensive on machines with - * serious number of cpus, therefore we need to take a distributed approach - * to calculating nr_active. - * - * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 - * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } - * - * So assuming nr_active := 0 when we start out -- true per definition, we - * can simply take per-cpu deltas and fold those into a global accumulate - * to obtain the same result. See calc_load_fold_active(). - * - * Furthermore, in order to avoid synchronizing all per-cpu delta folding - * across the machine, we assume 10 ticks is sufficient time for every - * cpu to have completed this task. - * - * This places an upper-bound on the IRQ-off latency of the machine. Then - * again, being late doesn't loose the delta, just wrecks the sample. - * - * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because - * this would add another cross-cpu cacheline miss and atomic operation - * to the wakeup path. Instead we increment on whatever cpu the task ran - * when it went into uninterruptible state and decrement on whatever cpu - * did the wakeup. This means that only the sum of nr_uninterruptible over - * all cpus yields the correct result. - * - * This covers the NO_HZ=n code, for extra head-aches, see the comment below. - */ - -/* Variables and functions for calc_load */ -static atomic_long_t calc_load_tasks; -static unsigned long calc_load_update; -unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); /* should be removed */ - -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. 
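The load-average machinery being moved out of this file updates avenrun[] in fixed point, one LOAD_FREQ sample at a time, using the formula spelled out in the comment above. A small user-space sketch of that update step; the FSHIFT/EXP_* constants are restated here from the usual definitions, not from this hunk:

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)
#define EXP_1    1884           /* ~= 2048 * exp(-5s/60s), 1-minute decay  */

/* One LOAD_FREQ step of a1 = a0*e + active*(1 - e), in fixed point. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun = 0;
        unsigned long active = 3 * FIXED_1;     /* 3 runnable tasks */
        int i;

        /* Starting from an idle system, the 1-minute average creeps toward
         * 3.00 one 5-second sample at a time. */
        for (i = 0; i < 12; i++) {
                avenrun = calc_load(avenrun, EXP_1, active);
                printf("%2d: %lu.%02lu\n", i + 1, avenrun >> FSHIFT,
                       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}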
- */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} - -static long calc_load_fold_active(struct rq *this_rq) -{ - long nr_active, delta = 0; - - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; - - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; - } - - return delta; -} - -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * Handle NO_HZ for the global load-average. - * - * Since the above described distributed algorithm to compute the global - * load-average relies on per-cpu sampling from the tick, it is affected by - * NO_HZ. - * - * The basic idea is to fold the nr_active delta into a global idle-delta upon - * entering NO_HZ state such that we can include this as an 'extra' cpu delta - * when we read the global state. - * - * Obviously reality has to ruin such a delightfully simple scheme: - * - * - When we go NO_HZ idle during the window, we can negate our sample - * contribution, causing under-accounting. - * - * We avoid this by keeping two idle-delta counters and flipping them - * when the window starts, thus separating old and new NO_HZ load. - * - * The only trick is the slight shift in index flip for read vs write. - * - * 0s 5s 10s 15s - * +10 +10 +10 +10 - * |-|-----------|-|-----------|-|-----------|-| - * r:0 0 1 1 0 0 1 1 0 - * w:0 1 1 0 0 1 1 0 0 - * - * This ensures we'll fold the old idle contribution in this window while - * accumlating the new one. - * - * - When we wake up from NO_HZ idle during the window, we push up our - * contribution, since we effectively move our sample point to a known - * busy state. - * - * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which - * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. - * - * When making the ILB scale, we should try to pull this in as well. - */ -static atomic_long_t calc_load_idle[2]; -static int calc_load_idx; - -static inline int calc_load_write_idx(void) -{ - int idx = calc_load_idx; - - /* - * See calc_global_nohz(), if we observe the new index, we also - * need to observe the new update time. - */ - smp_rmb(); - - /* - * If the folding window started, make sure we start writing in the - * next idle-delta. - */ - if (!time_before(jiffies, calc_load_update)) - idx++; - - return idx & 1; -} - -static inline int calc_load_read_idx(void) -{ - return calc_load_idx & 1; -} - -void calc_load_enter_idle(void) -{ - struct rq *this_rq = this_rq(); - long delta; - - /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. - */ - delta = calc_load_fold_active(this_rq); - if (delta) { - int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); - } -} - -void calc_load_exit_idle(void) -{ - struct rq *this_rq = this_rq(); - - /* - * If we're still before the sample window, we're done. 
- */ - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - /* - * We woke inside or after the sample window, this means we're already - * accounted through the nohz accounting, so skip the entire deal and - * sync up for the next window. - */ - this_rq->calc_load_update = calc_load_update; - if (time_before(jiffies, this_rq->calc_load_update + 10)) - this_rq->calc_load_update += LOAD_FREQ; -} - -static long calc_load_fold_idle(void) -{ - int idx = calc_load_read_idx(); - long delta = 0; - - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); - - return delta; -} - -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - -/* - * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. - * - * Once we've updated the global active value, we need to apply the exponential - * weights adjusted to the number of cycles missed. - */ -static void calc_global_nohz(void) -{ - long delta, active, n; - - if (!time_before(jiffies, calc_load_update + 10)) { - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - - calc_load_update += n * LOAD_FREQ; - } - - /* - * Flip the idle index... 
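fixed_power_int() and calc_load_n() above exist so that n missed sampling periods can be folded in with a single multiplication instead of n separate updates. A user-space sketch checking the closed form a*e^n against applying the per-step update n times; the two agree up to fixed-point rounding:

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)
#define EXP_1    1884

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

/* x^n in fixed point, by binary exponentiation: the same algorithm as the
 * fixed_power_int() being removed above. */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                     unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

int main(void)
{
        unsigned long a = 2 * FIXED_1, active = 0;
        unsigned long stepwise = a, closed;
        unsigned int n = 10, i;

        /* Decaying step by step ... */
        for (i = 0; i < n; i++)
                stepwise = calc_load(stepwise, EXP_1, active);

        /* ... matches the closed form a*e^n + active*(1 - e^n). */
        closed = calc_load(a, fixed_power_int(EXP_1, FSHIFT, n), active);

        printf("stepwise=%lu closed=%lu\n", stepwise, closed);
        return 0;
}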
- * - * Make sure we first write the new time then flip the index, so that - * calc_load_write_idx() will see the new time when it reads the new - * index, this avoids a double flip messing things up. - */ - smp_wmb(); - calc_load_idx++; -} -#else /* !CONFIG_NO_HZ_COMMON */ - -static inline long calc_load_fold_idle(void) { return 0; } -static inline void calc_global_nohz(void) { } - -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * calc_load - update the avenrun load estimates 10 ticks after the - * CPUs have updated calc_load_tasks. - */ -void calc_global_load(unsigned long ticks) -{ - long active, delta; - - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; - - avenrun[0] = calc_load(avenrun[0], EXP_1, active); - avenrun[1] = calc_load(avenrun[1], EXP_5, active); - avenrun[2] = calc_load(avenrun[2], EXP_15, active); - - calc_load_update += LOAD_FREQ; - - /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. - */ - calc_global_nohz(); -} - -/* - * Called from update_cpu_load() to periodically update this CPU's - * active count. - */ -static void calc_load_account_active(struct rq *this_rq) -{ - long delta; - - if (time_before(jiffies, this_rq->calc_load_update)) - return; - - delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - this_rq->calc_load_update += LOAD_FREQ; -} - -/* - * End of global load-average stuff - */ - -/* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load - * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load - * - * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load - * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. - */ -#define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. 
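The degrade_factor[] comment above says that entry 12 (load index 2, column 3) approximates the exact decay (3/4)^8 on a 128-point scale. A two-minute check of that arithmetic:

#include <stdio.h>

int main(void)
{
        /* Per-tick decay at load index 2 is (2^2 - 1)/2^2 = 3/4; after 8
         * missed ticks the exact factor is (3/4)^8. */
        double exact = 1.0;
        int i;

        for (i = 0; i < 8; i++)
                exact *= 3.0 / 4.0;

        /* 6561/65536 ~= 0.1001, i.e. about 12.8 on the 128-point scale;
         * the table stores 12. */
        printf("exact = %.4f, on 128-point scale = %.1f\n", exact, exact * 128);
        return 0;
}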
- */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} - -/* - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. - */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; - old_load = decay_load_missed(old_load, pending_updates - 1, i); - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } - - sched_avg_update(this_rq); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -void update_idle_cpu_load(struct rq *this_rq) -{ - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long load = this_rq->load.weight; - unsigned long pending_updates; - - /* - * bail if there's load or we're actually up-to-date. - */ - if (load || curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); -} - -/* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. - */ -void update_cpu_load_nohz(void) -{ - struct rq *this_rq = this_rq(); - unsigned long curr_jiffies = ACCESS_ONCE(jiffies); - unsigned long pending_updates; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. 
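The round-up in __update_cpu_load() above is what keeps the averaged cpu_load[] from stalling one unit below an increasing load. A worked example at index 2 (scale 4), old load 9, new load 10:

#include <stdio.h>

int main(void)
{
        unsigned long i = 2, scale = 1UL << i;  /* cpu_load[] index 2 */
        unsigned long old_load = 9, new_load = 10;
        unsigned long stuck, moved;

        /* Without the round-up: (9*3 + 10) >> 2 = 37 >> 2 = 9, forever. */
        stuck = (old_load * (scale - 1) + new_load) >> i;

        /* With it, an increasing load is rounded up first (10 -> 13) and
         * the average actually reaches the new value: 40 >> 2 = 10. */
        if (new_load > old_load)
                new_load += scale - 1;
        moved = (old_load * (scale - 1) + new_load) >> i;

        printf("without rounding: %lu, with: %lu\n", stuck, moved);
        return 0;
}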
- */ - __update_cpu_load(this_rq, 0, pending_updates); - } - raw_spin_unlock(&this_rq->lock); -} -#endif /* CONFIG_NO_HZ_COMMON */ - -/* - * Called from scheduler_tick() - */ -static void update_cpu_load_active(struct rq *this_rq) -{ - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, this_rq->load.weight, 1); - - calc_load_account_active(this_rq); -} - #ifdef CONFIG_SMP /* @@ -2673,7 +2108,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock_task - p->se.exec_start; + ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -2726,8 +2161,8 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2750,6 +2185,8 @@ void scheduler_tick(void) * This makes sure that uptime, CFS vruntime, load * balancing, etc... continue to move forward, even * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds. */ u64 scheduler_tick_max_deferment(void) { @@ -2953,6 +2390,12 @@ need_resched: if (sched_feat(HRTICK)) hrtick_clear(rq); + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; @@ -3069,13 +2512,11 @@ void __sched schedule_preempt_disabled(void) */ asmlinkage void __sched notrace preempt_schedule(void) { - struct thread_info *ti = current_thread_info(); - /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; do { @@ -3219,7 +2660,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, if (unlikely(!q)) return; - if (unlikely(!nr_exclusive)) + if (unlikely(nr_exclusive != 1)) wake_flags = 0; spin_lock_irqsave(&q->lock, flags); @@ -3355,8 +2796,8 @@ EXPORT_SYMBOL(wait_for_completion); * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) @@ -3388,8 +2829,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * specified timeout to expire. The timeout is in jiffies. It is not * interruptible. The caller is accounted as waiting for IO. * - * The return value is 0 if timed out, and positive (at least 1, or number of - * jiffies left till timeout) if completed. + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. */ unsigned long __sched wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) @@ -3405,7 +2846,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); * This waits for completion of a specific task to be signaled. It is * interruptible. * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. 
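The completion kerneldoc above now spells out the return conventions explicitly. A hedged kernel-side sketch (not part of the patch; my_done and the helper names are made up) of how callers typically consume those values:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static DECLARE_COMPLETION(my_done);

static int wait_with_timeout(void)
{
        /* 0 means the timeout expired; a positive value is the number of
         * jiffies still remaining when the completion fired. */
        if (!wait_for_completion_timeout(&my_done, msecs_to_jiffies(500)))
                return -ETIMEDOUT;
        return 0;
}

static int wait_interruptibly(void)
{
        /* -ERESTARTSYS if a signal arrived first, 0 on completion. */
        return wait_for_completion_interruptible(&my_done);
}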
+ * Return: -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_interruptible(struct completion *x) { @@ -3424,8 +2865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_interruptible_timeout(struct completion *x, @@ -3442,7 +2883,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); * This waits to be signaled for completion of a specific task. It can be * interrupted by a kill signal. * - * The return value is -ERESTARTSYS if interrupted, 0 if completed. + * Return: -ERESTARTSYS if interrupted, 0 if completed. */ int __sched wait_for_completion_killable(struct completion *x) { @@ -3462,8 +2903,8 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. * - * The return value is -ERESTARTSYS if interrupted, 0 if timed out, - * positive (at least 1, or number of jiffies left till timeout) if completed. + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. */ long __sched wait_for_completion_killable_timeout(struct completion *x, @@ -3477,7 +2918,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * - * Returns: 0 if a decrement cannot be done without blocking + * Return: 0 if a decrement cannot be done without blocking * 1 if a decrement succeeded. * * If a completion is being used as a counting completion, @@ -3504,7 +2945,7 @@ EXPORT_SYMBOL(try_wait_for_completion); * completion_done - Test to see if a completion has any waiters * @x: completion structure * - * Returns: 0 if there are waiters (wait_for_completion() in progress) + * Return: 0 if there are waiters (wait_for_completion() in progress) * 1 if there are no waiters. * */ @@ -3741,7 +3182,7 @@ SYSCALL_DEFINE1(nice, int, increment) * task_prio - return the priority value of a given task. * @p: the task in question. * - * This is the priority value as seen by users in /proc. + * Return: The priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ @@ -3753,6 +3194,8 @@ int task_prio(const struct task_struct *p) /** * task_nice - return the nice value of a given task. * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. */ int task_nice(const struct task_struct *p) { @@ -3763,6 +3206,8 @@ EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. */ int idle_cpu(int cpu) { @@ -3785,6 +3230,8 @@ int idle_cpu(int cpu) /** * idle_task - return the idle task for a given cpu. * @cpu: the processor in question. + * + * Return: The idle task for the cpu @cpu. 
*/ struct task_struct *idle_task(int cpu) { @@ -3794,6 +3241,8 @@ struct task_struct *idle_task(int cpu) /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. */ static struct task_struct *find_process_by_pid(pid_t pid) { @@ -3991,6 +3440,8 @@ recheck: * @policy: new policy. * @param: structure containing the new RT priority. * + * Return: 0 on success. An error code otherwise. + * * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, @@ -4010,6 +3461,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * current context has permission. For example, this is needed in * stop_machine(): we create temporary high priority worker threads, * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) @@ -4044,6 +3497,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @pid: the pid in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) @@ -4059,6 +3514,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { @@ -4068,6 +3525,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. */ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { @@ -4094,6 +3554,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { @@ -4218,6 +3681,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4269,6 +3734,8 @@ out_unlock: * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4303,6 +3770,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. + * + * Return: 0. 
*/ SYSCALL_DEFINE0(sched_yield) { @@ -4428,7 +3897,7 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns: + * Return: * true (>0) if we indeed boosted the target task. * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. @@ -4531,8 +4000,9 @@ long __sched io_schedule_timeout(long timeout) * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. * - * this syscall returns the maximum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { @@ -4556,8 +4026,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) * sys_sched_get_priority_min - return minimum RT priority. * @policy: scheduling class. * - * this syscall returns the minimum rt_priority that can be used - * by a given scheduling class. + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { @@ -4583,6 +4054,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) @@ -4692,7 +4166,7 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle) { idle->sched_class = &idle_sched_class; } @@ -4705,7 +4179,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -4745,7 +4219,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif @@ -4947,6 +4421,13 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* + * put_prev_task() and pick_next_task() sched + * class method both need to have an up-to-date + * value of rq->clock[_task] + */ + update_rq_clock(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only @@ -5080,7 +4561,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) return table; } -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) { struct ctl_table *entry, *table; struct sched_domain *sd; @@ -5182,7 +4663,7 @@ static void set_rq_offline(struct rq *rq) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. 
*/ -static int __cpuinit +static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; @@ -5236,12 +4717,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * happens before everything else. This has to be lower priority than * the notifier in the perf_event subsystem, though. */ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = { .notifier_call = migration_call, .priority = CPU_PRI_MIGRATION, }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { @@ -5254,7 +4735,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, } } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { @@ -5466,7 +4947,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5635,18 +5117,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; int id = cpu; + int size = 1; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; } @@ -5670,6 +5157,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. 
+ */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; @@ -5894,7 +5388,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); - if (cpu != cpumask_first(sched_domain_span(sd))) + if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); @@ -5904,12 +5398,12 @@ build_sched_groups(struct sched_domain *sd, int cpu) for_each_cpu(i, span) { struct sched_group *sg; - int group = get_group(i, sdd, &sg); - int j; + int group, j; if (cpumask_test_cpu(i, covered)) continue; + group = get_group(i, sdd, &sg); cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); @@ -5947,7 +5441,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; - WARN_ON(!sd || !sg); + WARN_ON(!sg); do { sg->group_weight = cpumask_weight(sched_group_cpus(sg)); @@ -6112,6 +5606,9 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->init; tl++) + #ifdef CONFIG_NUMA static int sched_domains_numa_levels; @@ -6409,7 +5906,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; sdd->sd = alloc_percpu(struct sched_domain *); @@ -6462,7 +5959,7 @@ static void __sdt_free(const struct cpumask *cpu_map) struct sched_domain_topology_level *tl; int j; - for (tl = sched_domain_topology; tl->init; tl++) { + for_each_sd_topology(tl) { struct sd_data *sdd = &tl->data; for_each_cpu(j, cpu_map) { @@ -6490,9 +5987,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, - struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *child, - int cpu) + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *child, int cpu) { struct sched_domain *sd = tl->init(tl, cpu); if (!sd) @@ -6503,8 +5999,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + sd->child = child; } - sd->child = child; set_domain_attribute(sd, attr); return sd; @@ -6517,7 +6013,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state = sa_none; + enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; int i, ret = -ENOMEM; @@ -6531,18 +6027,15 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->init; tl++) { - sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + for_each_sd_topology(tl) { + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + if (tl == sched_domain_topology) + *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } - - while (sd->child) - sd = sd->child; - - 
*per_cpu_ptr(d.sd, i) = sd; } /* Build the groups for the domains */ @@ -6737,8 +6230,9 @@ match1: ; } + n = ndoms_cur; if (doms_new == NULL) { - ndoms_cur = 0; + n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); @@ -6746,7 +6240,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; @@ -6854,9 +6348,6 @@ void __init sched_init_smp(void) hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - /* RT runtime code needs to handle some hotplug events */ - hotcpu_notifier(update_runtime, 0); - init_hrtick(); /* Move init over to a non-isolated CPU */ @@ -7188,6 +6679,8 @@ void normalize_rt_tasks(void) * @cpu: the processor in question. * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + * + * Return: The current task for @cpu. */ struct task_struct *curr_task(int cpu) { @@ -7319,7 +6812,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7641,23 +7134,22 @@ int sched_rt_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css ? 
container_of(css, struct task_group, css) : NULL; } -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7665,41 +7157,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent; + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css_parent(css)); - if (!cgrp->parent) - return 0; - - parent = cgroup_tg(cgrp->parent); - sched_online_group(tg, parent); + if (parent) + sched_online_group(tg, parent); return 0; } -static void cpu_cgroup_css_free(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static void cpu_cgroup_css_offline(struct cgroup *cgrp) +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_offline_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7710,18 +7199,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, - struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. 
@@ -7735,15 +7224,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7865,26 +7355,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7965,10 +7457,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); @@ -7981,26 +7473,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return 
sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95e..f64722ff029 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -33,30 +33,20 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) { - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); + return css ? container_of(css, struct cpuacct, css) : NULL; } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(task_css(tsk, cpuacct_subsys_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(css_parent(&ca->css)); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { }; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuacct *ca; - if (!cgrp->parent) + if (!parent_css) return &root_cpuacct.css; ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -96,9 +87,9 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup_subsys_state *css) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); free_percpu(ca->cpustat); free_percpu(ca->cpuusage); @@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; @@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int err = 0; int i; @@ -172,10 +163,10 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct cpuacct *ca = cgroup_ca(cgroup); + struct cpuacct *ca = css_ca(css); u64 percpu; int i; @@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; 
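/*
 * [Editor's aside -- not part of the patch.]  The cgroup_tg()/cgroup_ca() ->
 * css_tg()/css_ca() conversions in the hunks above all lean on the classic
 * container_of() idiom: given a pointer to an embedded member, recover the
 * enclosing structure.  Below is only a minimal, self-contained sketch of
 * that pattern; struct css and struct tg are made-up stand-ins for
 * cgroup_subsys_state and task_group, not kernel definitions.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int id; };                      /* stand-in for cgroup_subsys_state */
struct tg  { long shares; struct css css; }; /* stand-in for task_group          */

static struct tg *css_tg(struct css *css)
{
	/* A NULL css maps to a NULL group, mirroring the patched helpers. */
	return css ? container_of(css, struct tg, css) : NULL;
}

int main(void)
{
	struct tg group = { .shares = 1024 };

	printf("%ld\n", css_tg(&group.css)->shares);	/* prints 1024 */
	return 0;
}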
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct cgroup_map_cb *cb) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int cpu; s64 val = 0; @@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = __parent_ca(ca); + ca = parent_ca(ca); } rcu_read_unlock(); } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1095e878a46..8b836b376d9 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -62,7 +62,7 @@ static int convert_prio(int prio) * any discrepancies created by racing against the uncertainty of the current * priority configuration. * - * Returns: (int)bool - CPUs were found + * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) @@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * cpupri_init - initialize the cpupri structure * @cp: The cpupri context * - * Returns: -ENOMEM if memory fails. + * Return: -ENOMEM on memory allocation failure. */ int cpupri_init(struct cpupri *cp) { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a..99947919e30 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, * is the only cgroup, then nothing else should be necessary. * */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + __this_cpu_add(kernel_cpustat.cpustat[index], tmp); cpuacct_account_field(p, index, tmp); } @@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ #ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) +void vtime_common_task_switch(struct task_struct *prev) { - if (!vtime_accounting_enabled()) - return; - if (is_idle_task(prev)) vtime_account_idle(prev); else @@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). 
*/ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_common_account_irq_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - if (!in_interrupt()) { /* * If we interrupted user, context_tracking_in_user() @@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -515,9 +509,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) for (;;) { /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } + if (stime > rtime) + swap(rtime, stime); /* Make sure 'total' fits in 32 bits */ if (total >> 32) @@ -558,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, stime, utime, total; - - if (vtime_accounting_enabled()) { - *ut = curr->utime; - *st = curr->stime; - return; - } - - stime = curr->stime; - total = stime + curr->utime; + cputime_t rtime, stime, utime; /* * Tick based cputime accounting depend on random scheduling @@ -589,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr, if (prev->stime + prev->utime >= rtime) goto out; - if (total) { + stime = curr->stime; + utime = curr->utime; + + if (utime == 0) { + stime = rtime; + } else if (stime == 0) { + utime = rtime; + } else { + cputime_t total = stime + utime; + stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)total); utime = rtime - stime; - } else { - stime = rtime; - utime = 0; } /* @@ -665,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } -void vtime_account_irq_exit(struct task_struct *tsk) +void vtime_gen_account_irq_exit(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } @@ -689,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - if (!vtime_accounting_enabled()) - return; - - delta_cpu = get_vtime_delta(tsk); - write_seqlock(&tsk->vtime_seqlock); + delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); write_sequnlock(&tsk->vtime_seqlock); @@ -702,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); - tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_sequnlock(&tsk->vtime_seqlock); } void vtime_guest_enter(struct task_struct *tsk) { + /* + * The flags must be updated under the lock with + * the vtime_snap flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. 
+ */ write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags |= PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { @@ -726,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk) current->flags &= ~PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { @@ -734,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(delta_cpu); } -bool vtime_accounting_enabled(void) -{ - return context_tracking_active(); -} - void arch_vtime_task_switch(struct task_struct *prev) { write_seqlock(&prev->vtime_seqlock); @@ -747,17 +728,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 75024a67352..196559994f7 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " "); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, + p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", cfs_rq->runnable_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", + SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", - (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); - SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib", +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", atomic_read(&cfs_rq->tg->runnable_avg)); #endif +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -287,7 +289,7 @@ do { \ P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); - P(curr->pid); + SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); P(cpu_load[0]); P(cpu_load[1]); @@ -490,18 +492,19 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); SEQ_printf(m, - 
"---------------------------------------------------------\n"); + "---------------------------------------------------------" + "----------\n"); #define __P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ - SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) #define __PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ - SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) } #endif __P(nr_switches); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SMP + P(se.avg.runnable_avg_sum); + P(se.avg.runnable_avg_period); + P(se.avg.load_avg_contrib); + P(se.avg.decay_count); +#endif P(policy); P(prio); #undef PN @@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = cpu_clock(this_cpu); t1 = cpu_clock(this_cpu); - SEQ_printf(m, "%-35s:%21Ld\n", + SEQ_printf(m, "%-45s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c61a614465c..7c70201fbc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible @@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#ifdef CONFIG_SMP +static inline void __update_task_entity_contrib(struct sched_entity *se); + +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ + u32 slice; + + p->se.avg.decay_count = 0; + slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; + p->se.avg.runnable_avg_sum = slice; + p->se.avg.runnable_avg_period = slice; + __update_task_entity_contrib(&p->se); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ +} +#endif + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock_task; + u64 now = rq_clock_task(rq_of(cfs_rq)); unsigned long delta_exec; if (unlikely(!curr)) @@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); } /* @@ -738,14 +776,14 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_of(cfs_rq)->clock - se->statistics.wait_start)); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->statistics.wait_start); + rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); } #endif schedstat_set(se->statistics.wait_start, 0); @@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock_task; + se->exec_start = rq_clock_task(rq_of(cfs_rq)); } /************************************************** @@ -813,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; - if (!sched_feat_numa(NUMA)) + if (!numabalancing_enabled) return; /* FIXME: Allocate task-specific structure for placement policy here */ @@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) * to gain a more accurate current total weight. See * update_cfs_rq_load_contribution(). */ - tg_weight = atomic64_read(&tg->load_avg); + tg_weight = atomic_long_read(&tg->load_avg); tg_weight -= cfs_rq->tg_load_contrib; tg_weight += cfs_rq->load.weight; @@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_FAIR_GROUP_SCHED */ -/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP /* * We choose a half-life close to 1 scheduling period. * Note: The tables below are dependent on this value. 
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) { struct task_group *tg = cfs_rq->tg; - s64 tg_contrib; + long tg_contrib; tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; - if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { - atomic64_add(tg_contrib, &tg->load_avg); + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; } } @@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) u64 contrib; contrib = cfs_rq->tg_load_contrib * tg->shares; - se->avg.load_avg_contrib = div64_u64(contrib, - atomic64_read(&tg->load_avg) + 1); + se->avg.load_avg_contrib = div_u64(contrib, + atomic_long_read(&tg->load_avg) + 1); /* * For group entities we need to compute a correction term in the case @@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) if (!decays && !force_update) return; - if (atomic64_read(&cfs_rq->removed_load)) { - u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); + if (atomic_long_read(&cfs_rq->removed_load)) { + unsigned long removed_load; + removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); subtract_blocked_load_contrib(cfs_rq, removed_load); } @@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } @@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, * We track migrations using entity decay_count <= 0, on a wake-up * migration we use a negative decay count to track the remote decays * accumulated while sleeping. + * + * Newly forked tasks are enqueued with se->avg.decay_count == 0, they + * are seen by enqueue_entity_load_avg() as a migration with an already + * constructed load_avg_contrib. */ if (unlikely(se->avg.decay_count <= 0)) { - se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq)); if (se->avg.decay_count) { /* * In a wake-up migration we have to approximate the @@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } wakeup = 0; } else { - __synchronize_entity_decay(se); + /* + * Task re-woke on same cpu (or else migrate_task_rq_fair() + * would have made count negative); we must be careful to avoid + * double-accounting blocked time after synchronizing decays. 
+ */ + se->avg.last_runnable_update += __synchronize_entity_decay(se) + << 20; } /* migrated tasks did not contribute to our blocked load */ @@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) tsk = task_of(se); if (se->statistics.sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; @@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } if (se->statistics.block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; if ((s64)delta < 0) delta = 0; @@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update the normalized vruntime before updating min_vruntime - * through callig update_curr(). + * through calling update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; @@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); } #endif } @@ -1984,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_entity_load_avg(curr, 1); update_cfs_rq_blocked_load(cfs_rq, 1); + update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK /* @@ -2082,7 +2131,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) if (unlikely(cfs_rq->throttle_count)) return cfs_rq->throttled_clock_task; - return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; + return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } /* returns 0 on failure to allocate runtime */ @@ -2138,10 +2187,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct rq *rq = rq_of(cfs_rq); /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) return; if (cfs_rq->runtime_remaining < 0) @@ -2230,7 +2278,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) #ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ - cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } #endif @@ -2245,7 +2293,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) - cfs_rq->throttled_clock_task = rq->clock_task; + cfs_rq->throttled_clock_task = rq_clock_task(rq); cfs_rq->throttle_count++; return 0; @@ -2284,7 +2332,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rq->nr_running -= task_delta; cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq->clock; + cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); raw_spin_unlock(&cfs_b->lock); @@ -2298,15 
+2346,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + se = cfs_rq->tg->se[cpu_of(rq)]; cfs_rq->throttled = 0; + + update_rq_clock(rq); + raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); - update_rq_clock(rq); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -2599,10 +2649,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -2706,7 +2752,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) #else /* CONFIG_CFS_BANDWIDTH */ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { - return rq_of(cfs_rq)->clock_task; + return rq_clock_task(rq_of(cfs_rq)); } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, @@ -2919,7 +2965,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Used instead of source_load when we know the type == 0 */ static unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->load.weight; + return cpu_rq(cpu)->cfs.runnable_load_avg; } /* @@ -2964,13 +3010,31 @@ static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) - return rq->load.weight / nr_running; + return load_avg / nr_running; return 0; } +static void record_wakee(struct task_struct *p) +{ + /* + * Rough decay (wiping) for cost saving, don't worry + * about the boundary, really active task won't care + * about the loss. + */ + if (jiffies > current->wakee_flip_decay_ts + HZ) { + current->wakee_flips = 0; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} static void task_waking_fair(struct task_struct *p) { @@ -2991,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p) #endif se->vruntime -= min_vruntime; + record_wakee(p); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3109,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif +static int wake_wide(struct task_struct *p) +{ + int factor = this_cpu_read(sd_llc_size); + + /* + * Yeah, it's the switching-frequency, could means many wakee or + * rapidly switch, use factor here will just help to automatically + * adjust the loose-degree, so bigger node will lead to more pull. + */ + if (p->wakee_flips > factor) { + /* + * wakee is somewhat hot, it needs certain amount of cpu + * resource, so if waker is far more hot, prefer to leave + * it alone. 
+ */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; @@ -3118,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much. + */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); @@ -3416,12 +3510,6 @@ unlock: } /* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED -/* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the * previous cpu. However, the caller only guarantees p->pi_lock is held; no @@ -3441,10 +3529,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) */ if (se->avg.decay_count) { se->avg.decay_count = -__synchronize_entity_decay(se); - atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); + atomic_long_add(se->avg.load_avg_contrib, + &cfs_rq->removed_load); } } -#endif #endif /* CONFIG_SMP */ static unsigned long @@ -3946,7 +4034,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4131,118 +4219,120 @@ static void update_blocked_averages(int cpu) } /* - * Compute the cpu's hierarchical load factor for each task group. + * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
*/ -static int tg_load_down(struct task_group *tg, void *data) +static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - struct rq *rq = cpu_rq(cpu); + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; unsigned long now = jiffies; + unsigned long load; - if (rq->h_load_throttle == now) + if (cfs_rq->last_h_load_update == now) return; - rq->h_load_throttle = now; + cfs_rq->h_load_next = NULL; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_load_next = se; + if (cfs_rq->last_h_load_update == now) + break; + } - rcu_read_lock(); - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); - rcu_read_unlock(); + if (!se) { + cfs_rq->h_load = cfs_rq->runnable_load_avg; + cfs_rq->last_h_load_update = now; + } + + while ((se = cfs_rq->h_load_next) != NULL) { + load = cfs_rq->h_load; + load = div64_ul(load * se->avg.load_avg_contrib, + cfs_rq->runnable_load_avg + 1); + cfs_rq = group_cfs_rq(se); + cfs_rq->h_load = load; + cfs_rq->last_h_load_update = now; + } } static unsigned long task_h_load(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - unsigned long load; - - load = p->se.load.weight; - load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return load; + update_cfs_rq_h_load(cfs_rq); + return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, + cfs_rq->runnable_load_avg + 1); } #else static inline void update_blocked_averages(int cpu) { } -static inline void update_h_load(long cpu) -{ -} - static unsigned long task_h_load(struct task_struct *p) { - return p->se.load.weight; + return p->se.avg.load_avg_contrib; } #endif /********** Helpers for find_busiest_group ************************/ /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. 
- */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - unsigned long this_has_capacity; - unsigned int this_idle_cpus; - - /* Statistics of the busiest group */ - unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - unsigned long busiest_group_capacity; - unsigned long busiest_has_capacity; - unsigned int busiest_group_weight; - - int group_imb; /* Is there imbalance in this sd */ -}; - -/* * sg_lb_stats - stats of a sched_group required for load_balancing */ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ - unsigned long group_capacity; - unsigned long idle_cpus; - unsigned long group_weight; + unsigned long load_per_task; + unsigned long group_power; + unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int group_capacity; + unsigned int idle_cpus; + unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ }; +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ +}; + +static inline void init_sd_lb_stats(struct sd_lb_stats *sds) +{ + /* + * Skimp on the clearing to avoid duplicate work. We can avoid clearing + * local_stat because update_sg_lb_stats() does a full clear/assignment. + * We must however clear busiest_stat::avg_load because + * update_sd_pick_busiest() reads this before assignment. + */ + *sds = (struct sd_lb_stats){ + .busiest = NULL, + .local = NULL, + .total_load = 0UL, + .total_pwr = 0UL, + .busiest_stat = { + .avg_load = 0UL, + }, + }; +} + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * + * Return: The load index. 
*/ static inline int get_sd_load_idx(struct sched_domain *sd, enum cpu_idle_type idle) @@ -4302,7 +4392,7 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq->clock - age_stamp); + total = sched_avg_period() + (rq_clock(rq) - age_stamp); if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ @@ -4420,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; } +/* + * Group imbalance indicates (and tries to solve) the problem where balancing + * groups is inadequate due to tsk_cpus_allowed() constraints. + * + * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a + * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Something like: + * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * + * + * If we were to balance group-wise we'd place two tasks in the first group and + * two tasks in the second group. Clearly this is undesired as it will overload + * cpu 3 and leave one of the cpus in the second group unused. + * + * The current solution to this issue is detecting the skew in the first group + * by noticing it has a cpu that is overloaded while the remaining cpus are + * idle -- or rather, there's a distinct imbalance in the cpus; see + * sg_imbalanced(). + * + * When this is so detected; this group becomes a candidate for busiest; see + * update_sd_pick_busiest(). And calculcate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditional to allow it + * to create an effective group imbalance. + * + * This is a somewhat tricky proposition since the next run might not find the + * group imbalance and decide the groups need to be balanced again. A most + * subtle and fragile situation. + */ + +struct sg_imb_stats { + unsigned long max_nr_running, min_nr_running; + unsigned long max_cpu_load, min_cpu_load; +}; + +static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +{ + sgi->max_cpu_load = sgi->max_nr_running = 0UL; + sgi->min_cpu_load = sgi->min_nr_running = ~0UL; +} + +static inline void +update_sg_imb_stats(struct sg_imb_stats *sgi, + unsigned long load, unsigned long nr_running) +{ + if (load > sgi->max_cpu_load) + sgi->max_cpu_load = load; + if (sgi->min_cpu_load > load) + sgi->min_cpu_load = load; + + if (nr_running > sgi->max_nr_running) + sgi->max_nr_running = nr_running; + if (sgi->min_nr_running > nr_running) + sgi->min_nr_running = nr_running; +} + +static inline int +sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +{ + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && + (sgi->max_nr_running - sgi->min_nr_running) > 1) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. 
*/ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, int *balance, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running, max_nr_running, min_nr_running; - unsigned long load, max_cpu_load, min_cpu_load; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long avg_load_per_task = 0; + struct sg_imb_stats sgi; + unsigned long nr_running; + unsigned long load; int i; - if (local_group) - balance_cpu = group_balance_cpu(group); - - /* Tally up the load of all CPUs in the group */ - max_cpu_load = 0; - min_cpu_load = ~0UL; - max_nr_running = 0; - min_nr_running = ~0UL; + init_sg_imb_stats(&sgi); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -4455,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu && - cpumask_test_cpu(i, sched_group_mask(group))) { - first_idle_cpu = 1; - balance_cpu = i; - } - load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - - if (nr_running > max_nr_running) - max_nr_running = nr_running; - if (min_nr_running > nr_running) - min_nr_running = nr_running; + update_sg_imb_stats(&sgi, load, nr_running); } sgs->group_load += load; @@ -4482,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (local_group) { - if (env->idle != CPU_NEWLY_IDLE) { - if (balance_cpu != env->dst_cpu) { - *balance = 0; - return; - } - update_group_power(env->sd, env->dst_cpu); - } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); - } + if (local_group && (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, group->sgp->next_update))) + update_group_power(env->sd, env->dst_cpu); /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + sgs->group_power = group->sgp->power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? 
- */ if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + + sgs->group_imb = sg_imbalanced(sgs, &sgi); - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && - (max_nr_running - min_nr_running) > 1) - sgs->group_imb = 1; + sgs->group_capacity = + DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, - SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(env->sd, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) @@ -4537,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, * * Determine if @sg is a busier group than the previously selected * busiest group. + * + * Return: %true if @sg is a busier group than the previously selected + * busiest group. %false otherwise. */ static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->max_load) + if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; if (sgs->sum_nr_running > sgs->group_capacity) @@ -4576,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - int *balance, struct sd_lb_stats *sds) + struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; - struct sg_lb_stats sgs; + struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) @@ -4589,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, load_idx = get_sd_load_idx(env->sd, env->idle); do { + struct sg_lb_stats *sgs = &tmp_sgs; int local_group; local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); - - if (local_group && !(*balance)) - return; + if (local_group) { + sds->local = sg; + sgs = &sds->local_stat; + } - sds->total_load += sgs.group_load; - sds->total_pwr += sg->sgp->power; + memset(sgs, 0, sizeof(*sgs)); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs); /* * In case the child domain prefers tasks go to siblings @@ -4611,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). 
*/ - if (prefer_sibling && !local_group && sds->this_has_capacity) - sgs.group_capacity = min(sgs.group_capacity, 1UL); + if (prefer_sibling && !local_group && + sds->local && sds->local_stat.group_has_capacity) + sgs->group_capacity = min(sgs->group_capacity, 1U); - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = sg; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - sds->this_has_capacity = sgs.group_has_capacity; - sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { - sds->max_load = sgs.avg_load; + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sgs->group_power; + + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_idle_cpus = sgs.idle_cpus; - sds->busiest_group_capacity = sgs.group_capacity; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->busiest_has_capacity = sgs.group_has_capacity; - sds->busiest_group_weight = sgs.group_weight; - sds->group_imb = sgs.group_imb; + sds->busiest_stat = *sgs; } sg = sg->next; @@ -4654,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, * assuming lower CPU number will be equivalent to lower a SMT thread * number. * - * Returns 1 when packing is required and a task should be moved to + * Return: 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * * @env: The load balancing environment. @@ -4675,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); + sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + SCHED_POWER_SCALE); return 1; } @@ -4693,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; + struct sg_lb_stats *local, *busiest; - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else { - sds->this_load_per_task = - cpu_avg_load_per_task(env->dst_cpu); - } + local = &sds->local_stat; + busiest = &sds->busiest_stat; - scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; + if (!local->sum_nr_running) + local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); + else if (busiest->load_per_task > local->load_per_task) + imbn = 1; - if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { - env->imbalance = sds->busiest_load_per_task; + scaled_busy_load_per_task = + (busiest->load_per_task * SCHED_POWER_SCALE) / + busiest->group_power; + + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { + env->imbalance = busiest->load_per_task; return; } @@ -4720,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. 
*/ - pwr_now += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load); + pwr_now += busiest->group_power * + min(busiest->load_per_task, busiest->avg_load); + pwr_now += local->group_power * + min(local->load_per_task, local->avg_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; - if (sds->max_load > tmp) - pwr_move += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load - tmp); + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + busiest->group_power; + if (busiest->avg_load > tmp) { + pwr_move += busiest->group_power * + min(busiest->load_per_task, + busiest->avg_load - tmp); + } /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; - pwr_move += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load + tmp); + if (busiest->avg_load * busiest->group_power < + busiest->load_per_task * SCHED_POWER_SCALE) { + tmp = (busiest->avg_load * busiest->group_power) / + local->group_power; + } else { + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + local->group_power; + } + pwr_move += local->group_power * + min(local->load_per_task, local->avg_load + tmp); pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - env->imbalance = sds->busiest_load_per_task; + env->imbalance = busiest->load_per_task; } /** @@ -4759,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long max_pull, load_above_capacity = ~0UL; + struct sg_lb_stats *local, *busiest; + + local = &sds->local_stat; + busiest = &sds->busiest_stat; - sds->busiest_load_per_task /= sds->busiest_nr_running; - if (sds->group_imb) { - sds->busiest_load_per_task = - min(sds->busiest_load_per_task, sds->avg_load); + if (busiest->group_imb) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure cpu-load equilibrium, look at wider averages. XXX + */ + busiest->load_per_task = + min(busiest->load_per_task, sds->avg_load); } /* @@ -4771,21 +4896,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (sds->max_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } - if (!sds->group_imb) { + if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. + * Except of course for the group_imb case, since then we might + * have to drop below capacity to reach cpu-load equilibrium. 
*/ - load_above_capacity = (sds->busiest_nr_running - - sds->busiest_group_capacity); + load_above_capacity = + (busiest->sum_nr_running - busiest->group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - - load_above_capacity /= sds->busiest->sgp->power; + load_above_capacity /= busiest->group_power; } /* @@ -4795,15 +4922,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. - * Be careful of negative numbers as they'll appear as very large values - * with unsigned longs. */ - max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - env->imbalance = min(max_pull * sds->busiest->sgp->power, - (sds->avg_load - sds->this_load) * sds->this->sgp->power) - / SCHED_POWER_SCALE; + env->imbalance = min( + max_pull * busiest->group_power, + (sds->avg_load - local->avg_load) * local->group_power + ) / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -4811,9 +4937,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * a think about bumping its value to force at least one task to be * moved */ - if (env->imbalance < sds->busiest_load_per_task) + if (env->imbalance < busiest->load_per_task) return fix_small_imbalance(env, sds); - } /******* find_busiest_group() helpers end here *********************/ @@ -4829,69 +4954,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. * - * Returns: - the busiest group if imbalance exists. + * Return: - The busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. */ -static struct sched_group * -find_busiest_group(struct lb_env *env, int *balance) +static struct sched_group *find_busiest_group(struct lb_env *env) { + struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; - memset(&sds, 0, sizeof(sds)); + init_sd_lb_stats(&sds); /* * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, balance, &sds); - - /* - * this_cpu is not the appropriate cpu to perform load balancing at - * this level. - */ - if (!(*balance)) - goto ret; + update_sd_lb_stats(env, &sds); + local = &sds.local_stat; + busiest = &sds.busiest_stat; if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || sds.busiest_nr_running == 0) + if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; /* * If the busiest group is imbalanced the below checks don't - * work because they assumes all things are equal, which typically + * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. 
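To make the reworked calculate_imbalance() arithmetic above concrete, the following sketch (invented figures; DEMO_POWER_SCALE stands in for SCHED_POWER_SCALE, and it assumes the new early-out for busiest->avg_load <= sds->avg_load has not triggered) evaluates max_pull and the resulting imbalance for one plausible set of group averages:

#include <stdio.h>

#define DEMO_POWER_SCALE 1024UL	/* stand-in for SCHED_POWER_SCALE */

int main(void)
{
	/* Invented example figures, all in the same scaled load units. */
	unsigned long busiest_avg = 2048, busiest_power = 1024;
	unsigned long local_avg   =  512, local_power   = 1024;
	unsigned long domain_avg  = 1024;		/* sds->avg_load */
	unsigned long load_above_capacity = ~0UL;	/* no capacity cap */
	unsigned long max_pull, by_busiest, by_local, imbalance;

	/* max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity) */
	max_pull = busiest_avg - domain_avg;
	if (load_above_capacity < max_pull)
		max_pull = load_above_capacity;

	/*
	 * imbalance = min(max_pull * busiest_power,
	 *                 (sds->avg_load - local->avg_load) * local_power)
	 *             / SCHED_POWER_SCALE
	 */
	by_busiest = max_pull * busiest_power;
	by_local   = (domain_avg - local_avg) * local_power;
	imbalance  = (by_busiest < by_local ? by_busiest : by_local) /
		     DEMO_POWER_SCALE;

	/* With these numbers: max_pull = 1024, imbalance = 512. */
	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	return 0;
}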
*/ - if (sds.group_imb) + if (busiest->group_imb) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && - !sds.busiest_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && + !busiest->group_has_capacity) goto force_balance; /* * If the local group is more busy than the selected busiest group * don't try and pull any tasks. */ - if (sds.this_load >= sds.max_load) + if (local->avg_load >= busiest->avg_load) goto out_balanced; /* * Don't pull any tasks if this group is already above the domain * average load. */ - if (sds.this_load >= sds.avg_load) + if (local->avg_load >= sds.avg_load) goto out_balanced; if (env->idle == CPU_IDLE) { @@ -4901,15 +5019,16 @@ find_busiest_group(struct lb_env *env, int *balance) * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && - sds.busiest_nr_running <= sds.busiest_group_weight) + if ((local->idle_cpus < busiest->idle_cpus) && + busiest->sum_nr_running <= busiest->group_weight) goto out_balanced; } else { /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ - if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) + if (100 * busiest->avg_load <= + env->sd->imbalance_pct * local->avg_load) goto out_balanced; } @@ -4919,7 +5038,6 @@ force_balance: return sds.busiest; out_balanced: -ret: env->imbalance = 0; return NULL; } @@ -4931,10 +5049,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; + unsigned long busiest_load = 0, busiest_power = 1; int i; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long power = power_of(i); unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); @@ -4943,9 +5061,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, env->cpus)) - continue; - rq = cpu_rq(i); wl = weighted_cpuload(i); @@ -4961,11 +5076,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, * the weighted_cpuload() scaled with the cpu power, so that * the load can be moved away from the cpu that is potentially * running at a lower capacity. + * + * Thus we're looking for max(wl_i / power_i), crosswise + * multiplication to rid ourselves of the division works out + * to: wl_i * power_j > wl_j * power_i; where j is our + * previous maximum. */ - wl = (wl * SCHED_POWER_SCALE) / power; - - if (wl > max_load) { - max_load = wl; + if (wl * busiest_power > busiest_load * power) { + busiest_load = wl; + busiest_power = power; busiest = rq; } } @@ -5002,13 +5121,47 @@ static int need_active_balance(struct lb_env *env) static int active_load_balance_cpu_stop(void *data); +static int should_we_balance(struct lb_env *env) +{ + struct sched_group *sg = env->sd->groups; + struct cpumask *sg_cpus, *sg_mask; + int cpu, balance_cpu = -1; + + /* + * In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. 
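The find_busiest_queue() hunk above drops the per-cpu division by replacing 'wl * SCHED_POWER_SCALE / power' with a cross-multiplied comparison, so no precision is lost to integer rounding. A small standalone sketch of the same selection rule (pick_busiest() and the sample arrays are invented):

#include <stdio.h>

/*
 * Pick the index with the largest wl[i]/power[i] ratio without dividing:
 * for positive powers, wl_i / power_i > wl_j / power_j is equivalent to
 * wl_i * power_j > wl_j * power_i, which is the comparison in the hunk above.
 */
static int pick_busiest(const unsigned long *wl, const unsigned long *power, int n)
{
	unsigned long best_load = 0, best_power = 1;
	int i, best = -1;

	for (i = 0; i < n; i++) {
		if (wl[i] * best_power > best_load * power[i]) {
			best_load  = wl[i];
			best_power = power[i];
			best = i;
		}
	}
	return best;
}

int main(void)
{
	/* cpu 1 carries less raw load than cpu 2 but has far less power. */
	unsigned long wl[]    = { 1024, 1536, 2048 };
	unsigned long power[] = { 1024,  512, 2048 };

	/* 1536/512 = 3.0 is the largest ratio, so index 1 is picked. */
	printf("busiest index: %d\n", pick_busiest(wl, power, 3));
	return 0;
}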
+ */ + if (env->idle == CPU_NEWLY_IDLE) + return 1; + + sg_cpus = sched_group_cpus(sg); + sg_mask = sched_group_mask(sg); + /* Try to find first idle cpu */ + for_each_cpu_and(cpu, sg_cpus, env->cpus) { + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + continue; + + balance_cpu = cpu; + break; + } + + if (balance_cpu == -1) + balance_cpu = group_balance_cpu(sg); + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above domains. + */ + return balance_cpu == env->dst_cpu; +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; struct sched_group *group; @@ -5038,11 +5191,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, balance); - - if (*balance == 0) + if (!should_we_balance(&env)) { + *continue_balancing = 0; goto out_balanced; + } + group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -5071,7 +5225,6 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); - update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(env.dst_rq, busiest); @@ -5241,7 +5394,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - this_rq->idle_stamp = this_rq->clock; + this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) return; @@ -5255,7 +5408,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; - int balance = 1; + int continue_balancing = 1; if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -5263,7 +5416,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, &balance); + sd, CPU_NEWLY_IDLE, + &continue_balancing); } interval = msecs_to_jiffies(sd->balance_interval); @@ -5418,10 +5572,9 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || !sd->nohz_idle) goto unlock; @@ -5436,10 +5589,9 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; - int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + sd = rcu_dereference_check_sched_domain(this_rq()->sd); if (!sd || sd->nohz_idle) goto unlock; @@ -5471,7 +5623,7 @@ void nohz_balance_enter_idle(int cpu) set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, +static int sched_ilb_notifier(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { @@ -5503,7 +5655,7 @@ void update_max_interval(void) */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { - int balance = 1; + int continue_balancing = 1; struct rq *rq = cpu_rq(cpu); unsigned long interval; 
struct sched_domain *sd; @@ -5535,7 +5687,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_SOME_PINNED logic could have changed * env->dst_cpu, so we can't know our idle @@ -5558,7 +5710,7 @@ out: * CPU in our sched group which is doing load balancing more * actively. */ - if (!balance) + if (!continue_balancing) break; } rcu_read_unlock(); @@ -5751,7 +5903,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_feat_numa(NUMA)) + if (numabalancing_enabled) task_tick_numa(rq, curr); update_rq_runnable_avg(rq, 1); @@ -5777,11 +5929,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); @@ -5848,17 +6004,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP /* * Remove our load from contribution when we leave sched_fair * and ensure we don't carry in an old decay_count if we * switch back. */ - if (p->se.avg.decay_count) { - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - __synchronize_entity_decay(&p->se); - subtract_blocked_load_contrib(cfs_rq, - p->se.avg.load_avg_contrib); + if (se->avg.decay_count) { + __synchronize_entity_decay(se); + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); } #endif } @@ -5907,9 +6061,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP atomic64_set(&cfs_rq->decay_counter, 1); - atomic64_set(&cfs_rq->removed_load, 0); + atomic_long_set(&cfs_rq->removed_load, 0); #endif } @@ -6091,6 +6245,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) se = tg->se[i]; /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); + + /* Possible calls to update_curr() need rq clock */ + update_rq_clock(rq); for_each_sched_entity(se) update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6146,9 +6303,8 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, -#ifdef CONFIG_FAIR_GROUP_SCHED .migrate_task_rq = migrate_task_rq_fair, -#endif + .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c new file mode 100644 index 00000000000..16f5a30f9c8 --- /dev/null +++ b/kernel/sched/proc.c @@ -0,0 +1,591 @@ +/* + * kernel/sched/proc.c + * + * Kernel load calculations, forked from sched/core.c + */ + +#include <linux/export.h> + +#include "sched.h" + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + +/* + * Global load-average calculations + * + * We take a distributed and 
async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +atomic_long_t calc_load_tasks; +unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. 
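The avenrun update described above is a fixed-point exponential moving average. A standalone sketch of one minute of 5-second samples (FSHIFT, FIXED_1 and EXP_1 follow the usual kernel definitions; demo_calc_load() and the 4-task scenario are invented for illustration, not part of the patch):

#include <stdio.h>

/* Fixed-point constants as used by the kernel's load average (1.0 == 2048). */
#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884		/* 1/exp(5s/1min) in fixed point */

/* Same rounding EMA step as calc_load() in the hunk above. */
static unsigned long demo_calc_load(unsigned long load, unsigned long exp,
				    unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;	/* 1-minute figure, fixed point */
	int i;

	/* Pretend 4 tasks stay runnable through a minute of 5-second samples. */
	for (i = 0; i < 12; i++)
		avenrun0 = demo_calc_load(avenrun0, EXP_1, 4 * FIXED_1);

	/* Prints roughly 2.5: after one minute the average is ~63% of 4. */
	printf("load1 ~= %lu.%02lu\n", avenrun0 >> FSHIFT,
	       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}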
+ * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. 
+ */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. 
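fixed_power_int() and calc_load_n() above exist so that a span of n missed LOAD_FREQ windows can be folded in with one update instead of n. A sketch (demo_ names invented; constants as in the previous sketch) comparing the iterated one-step update against the closed form a0*e^n + a*(1 - e^n); the two results differ only by accumulated fixed-point rounding:

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884

/* x^n in fixed point by binary exponentiation, as in fixed_power_int(). */
static unsigned long demo_fixed_power(unsigned long x, unsigned int frac_bits,
				      unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

/* One EMA step with the same rounding as calc_load() above. */
static unsigned long demo_step(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	return (load * exp + active * (FIXED_1 - exp) +
		(1UL << (FSHIFT - 1))) >> FSHIFT;
}

int main(void)
{
	unsigned long iterated = 3 * FIXED_1, active = 0, closed;
	unsigned int n = 8, i;

	/* Eight idle windows stepped one LOAD_FREQ period at a time... */
	for (i = 0; i < n; i++)
		iterated = demo_step(iterated, EXP_1, active);

	/* ...versus the single catch-up used by calc_global_nohz(). */
	closed = demo_step(3 * FIXED_1, demo_fixed_power(EXP_1, FSHIFT, n),
			   active);

	/* The two agree up to accumulated fixed-point rounding. */
	printf("iterated=%lu closed_form=%lu\n", iterated, closed);
	return 0;
}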
+ */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
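A sketch of how the degrade_factor table above is meant to be read (demo_ names invented; the idx-2 row is copied from the hunk with its implicit trailing zero spelled out): decaying a load across 8 missed idx-2 ticks via the table should approximate applying the exact per-tick factor 3/4 eight times.

#include <stdio.h>

#define DEMO_DEGRADE_SHIFT 7	/* factors are on a 128-point scale */

/* Row for load idx 2, taken from the degrade_factor table above. */
static const unsigned char demo_row_idx2[DEMO_DEGRADE_SHIFT + 1] =
	{ 96, 72, 40, 12, 1, 0, 0, 0 };

/* Same bit-walking decay as decay_load_missed(), for this single row. */
static unsigned long demo_decay(unsigned long load, unsigned long missed)
{
	int j = 0;

	while (missed) {
		if (missed % 2)
			load = (load * demo_row_idx2[j]) >> DEMO_DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1000, missed = 8, exact = load, i;

	/* Exact per-tick decay for idx 2 is load * 3/4 on every missed tick. */
	for (i = 0; i < missed; i++)
		exact = exact * 3 / 4;

	/* 8 missed ticks = only bit 3 set, so a single multiply by 12/128. */
	/* Both results land near 1000 * (3/4)^8, which is about 100.       */
	printf("table: %lu, exact: %lu\n", demo_decay(load, missed), exact);
	return 0;
}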
+ */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +#ifdef CONFIG_SMP +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->cfs.runnable_load_avg; +} +#else +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->load.weight; +} +#endif + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = get_rq_runnable_load(this_rq); + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* + * Called from scheduler_tick() + */ +void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = get_rq_runnable_load(this_rq); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
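The 'new_load += scale - 1' round-up above is what keeps cpu_load[i] from sticking just below a rising load. A tiny sketch of the idx-1 update with and without it (the numbers are the 9-versus-10 case from the comment; nothing here is part of the patch):

#include <stdio.h>

int main(void)
{
	/* idx 1: scale = 2, update is (old * (scale - 1) + new) >> 1 */
	unsigned long target = 10, scale = 2;
	unsigned long plain = 9, rounded = 9, new_load;
	int tick;

	for (tick = 0; tick < 10; tick++) {
		/* Without the round-up: (9 * 1 + 10) >> 1 == 9, forever. */
		plain = (plain * (scale - 1) + target) >> 1;

		/* With the round-up applied while the load is increasing. */
		new_load = target;
		if (new_load > rounded)
			new_load += scale - 1;
		rounded = (rounded * (scale - 1) + new_load) >> 1;
	}

	/* Prints plain=9 rounded=10: only the rounded average catches up. */
	printf("plain=%lu rounded=%lu\n", plain, rounded);
	return 0;
}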
+ */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); + + calc_load_account_active(this_rq); +} diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 127a2c4cf4a..01970c8e64d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg) (iter = next_task_group(iter)) && \ (rt_rq = iter->rt_rq[cpu_of(rq)]);) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_add_rcu(&rt_rq->leaf_rt_rq_list, - &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ - list_del_rcu(&rt_rq->leaf_rt_rq_list); -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) #ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { - return cpu_rq(smp_processor_id())->rd->span; + return this_rq()->rd->span; } #else static inline const struct cpumask *sched_rt_period_mask(void) @@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t; #define for_each_rt_rq(rt_rq, iter, rq) \ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) -static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) -{ -} - -#define for_each_leaf_rt_rq(rt_rq, rq) \ - for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = NULL) @@ -699,15 +674,6 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static void __enable_runtime(struct rq *rq) { rt_rq_iter_t iter; @@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq) } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; - if (!rt_rq->rt_nr_running) - list_add_leaf_rt_rq(rt_rq); - if (head) list_add(&rt_se->run_list, 
queue); else @@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); - if (!rt_rq->rt_nr_running) - list_del_leaf_rt_rq(rt_rq); } /* @@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); return p; } @@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Return the second highest RT task, NULL otherwise */ -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +/* + * Return the highest pushable rq's task, which is suitable to be executed + * on the cpu, NULL otherwise + */ +static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { - struct task_struct *next = NULL; - struct sched_rt_entity *rt_se; - struct rt_prio_array *array; - struct rt_rq *rt_rq; - int idx; - - for_each_leaf_rt_rq(rt_rq, rq) { - array = &rt_rq->active; - idx = sched_find_first_bit(array->bitmap); -next_idx: - if (idx >= MAX_RT_PRIO) - continue; - if (next && next->prio <= idx) - continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *p; - if (!rt_entity_is_task(rt_se)) - continue; + if (!has_pushable_tasks(rq)) + return NULL; - p = rt_task_of(rt_se); - if (pick_rt_task(rq, p, cpu)) { - next = p; - break; - } - } - if (!next) { - idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); - goto next_idx; - } + plist_for_each_entry(p, head, pushable_tasks) { + if (pick_rt_task(rq, p, cpu)) + return p; } - return next; + return NULL; } static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); @@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq) double_lock_balance(this_rq, src_rq); /* - * Are there still pullable RT tasks? + * We can pull only a task, which is pushable + * on its rq, and no others. */ - if (src_rq->rt.rt_nr_running <= 1) - goto skip; - - p = pick_next_highest_task_rt(src_rq, this_cpu); + p = pick_highest_pushable_task(src_rq, this_cpu); /* * Do we have an RT task that preempts @@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock_task; + p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ce39224d615..b3c5653e1dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -10,8 +10,16 @@ #include "cpupri.h" #include "cpuacct.h" +struct rq; + extern __read_mostly int scheduler_running; +extern unsigned long calc_load_update; +extern atomic_long_t calc_load_tasks; + +extern long calc_load_fold_active(struct rq *this_rq); +extern void update_cpu_load_active(struct rq *this_rq); + /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -140,10 +148,11 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; - atomic_t load_weight; - atomic64_t load_avg; +#ifdef CONFIG_SMP + atomic_long_t load_avg; atomic_t runnable_avg; #endif +#endif #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -261,27 +270,21 @@ struct cfs_rq { #endif #ifdef CONFIG_SMP -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#ifdef CONFIG_FAIR_GROUP_SCHED /* * CFS Load tracking * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). */ - u64 runnable_load_avg, blocked_load_avg; - atomic64_t decay_counter, removed_load; + unsigned long runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; u64 last_decay; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -/* These always depend on CONFIG_FAIR_GROUP_SCHED */ + atomic_long_t removed_load; + #ifdef CONFIG_FAIR_GROUP_SCHED + /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; - u64 tg_load_contrib; -#endif /* CONFIG_FAIR_GROUP_SCHED */ + unsigned long tg_load_contrib; /* * h_load = weight * f(tg) @@ -290,6 +293,9 @@ struct cfs_rq { * this group. */ unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -353,7 +359,6 @@ struct rt_rq { unsigned long rt_nr_boosted; struct rq *rq; - struct list_head leaf_rt_rq_list; struct task_group *tg; #endif }; @@ -426,9 +431,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#ifdef CONFIG_SMP - unsigned long h_load_throttle; -#endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -540,6 +542,16 @@ DECLARE_PER_CPU(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +static inline u64 rq_clock(struct rq *rq) +{ + return rq->clock; +} + +static inline u64 rq_clock_task(struct rq *rq) +{ + return rq->clock_task; +} + #ifdef CONFIG_SMP #define rcu_dereference_check_sched_domain(p) \ @@ -582,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) } DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); struct sched_group_power { @@ -652,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We cannot use task_subsys_state() and friends because the cgroup - * subsystem changes that value before the cgroup_subsys::attach() method - * is called, therefore we cannot pin it and might observe the wrong value. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). 
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WF_FORK 0x02 /* child wakeup after fork */ #define WF_MIGRATED 0x4 /* internal use, task got migrated */ -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); -/* - * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg - * becomes useful in lb - */ -#if defined(CONFIG_FAIR_GROUP_SCHED) extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else -static inline void idle_enter_fair(struct rq *this_rq) {} -static inline void idle_exit_fair(struct rq *this_rq) {} -#endif #else /* CONFIG_SMP */ @@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); @@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); +extern void init_task_runnable_average(struct task_struct *p); + #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5..c7edee71bce 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) */ static inline void sched_info_dequeued(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (unlikely(sched_info_on())) if (t->sched_info.last_queued) @@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long long now = task_rq(t)->clock, delta = 0; + unsigned long long now = rq_clock(task_rq(t)), delta = 0; if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; @@ -100,19 +100,20 @@ static inline void sched_info_queued(struct task_struct *t) { if (unlikely(sched_info_on())) if (!t->sched_info.last_queued) - t->sched_info.last_queued = task_rq(t)->clock; + t->sched_info.last_queued = rq_clock(task_rq(t)); } /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. 
* Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ static inline void sched_info_depart(struct task_struct *t) { - unsigned long long delta = task_rq(t)->clock - + unsigned long long delta = rq_clock(task_rq(t)) - t->sched_info.last_arrival; rq_sched_info_depart(task_rq(t), delta); @@ -162,6 +163,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) */ /** + * cputimer_running - return true if cputimer is running + * + * @tsk: Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return false; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return false; + + return true; +} + +/** * account_group_user_time - Maintain utime for a thread group. * * @tsk: Pointer to task structure. @@ -176,7 +210,7 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -199,7 +233,7 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -222,7 +256,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index da5eb5bed84..e08fbeeb54b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) struct task_struct *stop = rq->stop; if (stop && stop->on_rq) { - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = rq_clock_task(rq); return stop; } @@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) struct task_struct *curr = rq->curr; u64 delta_exec; - delta_exec = rq->clock_task - curr->se.exec_start; + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock_task; + curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); } @@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - stop->se.exec_start = rq->clock_task; + stop->se.exec_start = 
rq_clock_task(rq); } static void switched_to_stop(struct rq *rq, struct task_struct *p) diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b..ded28b91fa5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = schedule_timeout_interruptible(timeout); + timeout = freezable_schedule_timeout_interruptible(timeout); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; sigset_from_compat(&new_ka.sa.sa_mask, &mask); @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72a..0564571dcdf 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { + free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } @@ -73,7 +76,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { +static struct notifier_block hotplug_cfd_notifier = { .notifier_call = hotplug_cfd, }; @@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void) while (!list_empty(&list)) { struct call_single_data *csd; - unsigned int csd_flags; csd = list_entry(list.next, struct call_single_data, list); list_del(&csd->list); - /* - * 'csd' can be invalid after this call if flags == 0 - * (when called through generic_exec_single()), - * so save them away before making the call: - */ - csd_flags = csd->flags; - csd->func(csd->info); - /* - * Unlocked CSDs are valid through generic_exec_single(): - */ - if (csd_flags & CSD_FLAG_LOCK) - csd_unlock(csd); + csd_unlock(csd); } } @@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single); * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). - * Note that @wait will be implicitly turned on in case of allocation failures, - * since we fall back to on-stack allocation. * * Selection preference: * 1) current cpu if in @mask @@ -586,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); * * If @wait is true, then returns once @func has returned. * - * You must not call this function with disabled interrupts or - * from a hardware interrupt handler or from a bottom half handler. 
+ * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. */ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -596,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, smp_call_function_many(mask, func, info, wait); if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); + unsigned long flags; + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); } put_cpu(); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 02fc5c93367..eb89e180740 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -24,7 +24,7 @@ */ static DEFINE_PER_CPU(struct task_struct *, idle_threads); -struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +struct task_struct *idle_thread_get(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0da..d7d498d8cc4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_DISABLE_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); @@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt) WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); + trace_softirqs_on(_RET_IP_); sub_preempt_count(cnt); } @@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) void local_bh_enable(void) { - _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + _local_bh_enable_ip(_RET_IP_); } EXPORT_SYMBOL(local_bh_enable); @@ -195,8 +194,12 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing for at most 2 ms, - * and if need_resched() is not set. + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. + * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in + * certain cases, such as stop_machine(), jiffies may cease to + * increment and so we need the MAX_SOFTIRQ_RESTART limit as + * well to make sure we eventually return from this method. * * These limits have been established via experimentation. * The two things to balance is latency against fairness - @@ -204,6 +207,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. 
*/ #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) +#define MAX_SOFTIRQ_RESTART 10 asmlinkage void __do_softirq(void) { @@ -212,6 +216,7 @@ asmlinkage void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; + int max_restart = MAX_SOFTIRQ_RESTART; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -223,8 +228,7 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -265,7 +269,8 @@ restart: pending = local_softirq_pending(); if (pending) { - if (time_before(jiffies, end) && !need_resched()) + if (time_before(jiffies, end) && !need_resched() && + --max_restart) goto restart; wakeup_softirqd(); @@ -323,10 +328,19 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) - __do_softirq(); - else + if (!force_irqthreads) { + /* + * We can safely execute softirq on the current stack if + * it is the irq stack, because it should be near empty + * at this stage. But we have no way to know if the arch + * calls irq_exit() on the irq stack. So call softirq + * in its own stack to prevent from any overrun on top + * of a potentially deep task stack. + */ + do_softirq(); + } else { wakeup_softirqd(); + } } static inline void tick_irq_exit(void) @@ -694,7 +708,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) } EXPORT_SYMBOL(send_remote_softirq); -static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +static int remote_softirq_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { /* @@ -723,7 +737,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +static struct notifier_block remote_softirq_cpu_notifier = { .notifier_call = remote_softirq_cpu_notify, }; @@ -825,7 +839,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -840,7 +854,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; @@ -871,7 +885,6 @@ int __init __weak early_irq_init(void) return 0; } -#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { return NR_IRQS_LEGACY; @@ -881,4 +894,3 @@ int __init __weak arch_early_irq_init(void) { return 0; } -#endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3c..4b082b5cac9 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -34,6 +34,20 @@ #else #define raw_read_can_lock(l) read_can_lock(l) #define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + /* * We build the __lock_function inlines here. 
They are too large for * inlining all over the place, but here is only one user per function diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba2..c18ecca575b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -/* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * @@ -308,243 +294,6 @@ out_unlock: return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - disable_nonboot_cpus(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. 
- */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - disable_nonboot_cpus(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. 
(BSD-style) @@ -588,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -597,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -638,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETGID)) + if (ns_capable(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -722,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -731,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -785,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETUID)) { + if (ns_capable(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -842,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETUID)) { + if (!ns_capable(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -924,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETGID)) { + if (!ns_capable(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -995,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - nsown_capable(CAP_SETUID)) { + ns_capable(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1034,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - nsown_capable(CAP_SETGID)) { + ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; goto change_okay; @@ -1286,6 +1035,17 @@ out: return retval; } +static void set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + + if (task_session(curr) != pid) + change_pid(curr, PIDTYPE_SID, pid); + + if (task_pgrp(curr) != pid) + change_pid(curr, PIDTYPE_PGID, pid); +} + SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; @@ -1305,7 +1065,7 @@ SYSCALL_DEFINE0(setsid) goto out; group_leader->signal->leader = 1; - __set_special_pids(sid); + set_special_pids(sid); proc_clear_tty(group_leader); @@ -2258,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, 
unsigned __user *, nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill @@ -2332,8 +2030,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); + get_monotonic_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0f..b2f06f3c6a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -120,7 +120,6 @@ extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; -static int neg_one = -1; #endif static int zero; @@ -600,6 +599,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_MODULES { @@ -801,7 +807,7 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, @@ -814,7 +820,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dowatchdog, - .extra1 = &neg_one, + .extra1 = &zero, .extra2 = &sixty, }, { @@ -828,7 +834,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &watchdog_enabled, + .data = &watchdog_user_enabled, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dowatchdog, @@ -1044,6 +1050,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_proc_update_handler, }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_KMEMCHECK { @@ -1210,7 +1225,7 @@ static struct ctl_table vm_table[] = { .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, + .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", @@ -1456,14 +1471,14 @@ static struct ctl_table fs_table[] = { { .procname = "inode-nr", .data = &inodes_stat, - .maxlen = 2*sizeof(int), + .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, - .maxlen = 7*sizeof(int), + .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, @@ -1493,7 +1508,7 @@ static struct ctl_table fs_table[] = { { .procname = "dentry-state", .data = &dentry_stat, - .maxlen = 6*sizeof(int), + .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, @@ -2331,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + unsigned long jif = msecs_to_jiffies(*negp ? 
-*lvalp : *lvalp); + + if (jif > INT_MAX) + return 1; + *valp = (int)jif; } else { int val = *valp; unsigned long lval; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index aea4a9ea6fc..b609213ca9a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -3,7 +3,6 @@ #include "../fs/xfs/xfs_sysctl.h" #include <linux/sunrpc/debug.h> #include <linux/string.h> -#include <net/ip_vs.h> #include <linux/syscalls.h> #include <linux/namei.h> #include <linux/mount.h> diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f..8727032e3a6 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -4,6 +4,23 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/** + * task_work_add - ask the @task to execute @work->func() + * @task: the task which should run the callback + * @work: the callback to run + * @notify: send the notification if true + * + * Queue @work for task_work_run() below and notify the @task if @notify. + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task returns from kernel + * mode or exits. + * + * This is like the signal handler which runs in kernel mode, but it doesn't + * try to wake up the @task. + * + * RETURNS: + * 0 if succeeds or -ESRCH. + */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { @@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) return 0; } +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @func: identifies the work to remove + * + * Find the last queued pending work with ->func == @func and remove + * it from queue. + * + * RETURNS: + * The found work or NULL if not found. + */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; - struct callback_head *work = NULL; + struct callback_head *work; unsigned long flags; /* * If cmpxchg() fails we continue without updating pprev. @@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { - read_barrier_depends(); + smp_read_barrier_depends(); if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) return work; } +/** + * task_work_run - execute the works added by task_work_add() + * + * Flush the pending works. Should be used by the core kernel code. + * Called before the task returns to the user-mode or stops, or when + * it exits. In the latter case task_work_add() can no longer add the + * new work after task_work_run() returns. 
+ */ void task_work_run(void) { struct task_struct *task = current; diff --git a/kernel/time.c b/kernel/time.c index d3617dbd3dc..7c7964c33ae 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -11,7 +11,7 @@ * Modification history kernel/time.c * * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() + * Created file with time related functions from sched/core.c and adjtimex() * 1993-10-08 Torsten Duwe * adjtime interface update and CMOS clock write code * 1995-08-13 Torsten Duwe diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012..2b62fe86f9e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -105,7 +105,6 @@ config NO_HZ_FULL select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN - select CONTEXT_TRACKING_FORCE select IRQ_WORK help Adaptively try to shutdown the tick whenever possible, even when @@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. +config NO_HZ_FULL_SYSIDLE + bool "Detect full-system idle state for full dynticks system" + depends on NO_HZ_FULL + default n + help + At least one CPU must keep the scheduling-clock tick running for + timekeeping purposes whenever there is a non-idle CPU, where + "non-idle" also includes dynticks CPUs as long as they are + running non-idle tasks. Because the underlying adaptive-tick + support cannot distinguish between all CPUs being idle and + all CPUs each running a single task in dynticks mode, the + underlying support simply ensures that there is always a CPU + handling the scheduling-clock tick, whether or not all CPUs + are idle. This Kconfig option enables scalable detection of + the all-CPUs-idle state, thus allowing the scheduling-clock + tick to be disabled when all CPUs are idle. Note that scalable + detection of the all-CPUs-idle state means that larger systems + will be slower to declare the all-CPUs-idle state. + + Say Y if you would like to help debug all-CPUs-idle detection. + + Say N if you are unsure. + +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. 
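The kerneldoc added to kernel/task_work.c above pins down the contract: task_work_add() queues a callback that the task runs when it next returns from kernel mode or exits, it fails with -ESRCH once the task is already exiting, task_work_cancel() removes the last queued work matching a given function, and task_work_run() is what core kernel code calls to flush the queue. A minimal, hypothetical caller sketch under those rules (my_func, my_work and queue_my_work are illustrative names, not part of this patch):

#include <linux/sched.h>
#include <linux/task_work.h>

/* One-shot work item; must not be queued again before it has run. */
static struct callback_head my_work;

static void my_func(struct callback_head *head)
{
	/* Runs in the target task's context on its way back to user mode. */
	pr_info("task_work callback ran in %s\n", current->comm);
}

static int queue_my_work(struct task_struct *task)
{
	init_task_work(&my_work, my_func);
	/* Returns -ESRCH if @task is exiting; true asks for notification. */
	return task_work_add(task, &my_work, true);
}

If the callback turns out to be unnecessary, task_work_cancel(task, my_func) takes it back out again, as described by the new comment.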
+ config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab50..9250130646f 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b1294..eec50fcef9e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + #ifdef CONFIG_RTC_CLASS /** * alarmtimer_suspend - Suspend time callback @@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; } +EXPORT_SYMBOL_GPL(alarm_init); /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire * @alarm: ptr to alarm to set * @start: time to run the alarm */ @@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) spin_unlock_irqrestore(&base->lock, flags); return ret; } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); /** @@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) cpu_relax(); } } +EXPORT_SYMBOL_GPL(alarm_cancel); u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) alarm->node.expires = ktime_add(alarm->node.expires, interval); return overrun; } +EXPORT_SYMBOL_GPL(alarm_forward); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee13..662c5798a68 100644 --- a/kernel/time/clockevents.c +++ 
b/kernel/time/clockevents.c @@ -15,44 +15,82 @@ #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/notifier.h> #include <linux/smp.h> +#include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); - /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -232,47 +270,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -/** - * clockevents_register_notifier - register a clock events change listener +/* + * Called after a notify add to make devices available which were + * released from the notifier call. 
*/ -int clockevents_register_notifier(struct notifier_block *nb) +static void clockevents_notify_released(void) { - unsigned long flags; - int ret; + struct clock_event_device *dev; - raw_spin_lock_irqsave(&clockevents_lock, flags); - ret = raw_notifier_chain_register(&clockevents_chain, nb); - raw_spin_unlock_irqrestore(&clockevents_lock, flags); + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + } +} - return ret; +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; } /* - * Notify about a clock event change. Called with clockevents_lock - * held. + * Called with clockevents_mutex and clockevents_lock held */ -static void clockevents_do_notify(unsigned long reason, void *dev) +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { - raw_notifier_call_chain(&clockevents_chain, reason, dev); + /* Fast track. Device is unused */ + if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* - * Called after a notify add to make devices available which were - * released from the notifier call. + * SMP function call to unbind a device */ -static void clockevents_notify_released(void) +static void __clockevents_unbind(void *arg) { - struct clock_event_device *dev; + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} - while (!list_empty(&clockevents_released)) { - dev = list_entry(clockevents_released.next, - struct clock_event_device, list); - list_del(&dev->list); - list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); - } +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; } +/* + * Unbind a clockevents device. 
+ */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -290,7 +388,7 @@ void clockevents_register_device(struct clock_event_device *dev) raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); - clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -317,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** @@ -386,6 +484,7 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { + module_put(old->owner); clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); @@ -433,10 +532,36 @@ void clockevents_notify(unsigned long reason, void *arg) int cpu; raw_spin_lock_irqsave(&clockevents_lock, flags); - clockevents_do_notify(reason, arg); switch (reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, arg); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(arg); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(arg); + tick_shutdown_broadcast(arg); + tick_shutdown(arg); /* * Unregister the clock event devices which were * released from the users in the notify chain. 
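The unbind machinery added above is careful about ownership: clockevents_unbind_device() takes clockevents_mutex and then runs __clockevents_unbind() on the CPU that owns the device via smp_call_function_single(); an -EAGAIN from __clockevents_try_unbind() means the device is that CPU's active tick device, so clockevents_replace() tries to install a substitute before the whole operation gives up with -EBUSY. A hedged caller-side sketch, assuming the prototype is made visible through <linux/clockchips.h> next to the other clockevents_* declarations (my_ce and my_driver_teardown are illustrative):

#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct clock_event_device, my_ce);

static int my_driver_teardown(void)
{
	int cpu, ret;

	for_each_online_cpu(cpu) {
		/* Non-zero: device still in use and no replacement was found. */
		ret = clockevents_unbind_device(&per_cpu(my_ce, cpu), cpu);
		if (ret)
			return ret;
	}
	return 0;
}

The same operation is reachable from user space through the new unbind_device attribute, which with the usual system-bus layout should show up as /sys/devices/system/clockevents/clockeventN/unbind_device.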
@@ -462,4 +587,123 @@ void clockevents_notify(unsigned long reason, void *arg) raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + size_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? 
tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141..50a8736757f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@ #include <linux/tick.h> #include <linux/kthread.h> +#include "tick-internal.h" + void timecounter_init(struct timecounter *tc, const struct cyclecounter *cc, u64 start_tstamp) @@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) static struct clocksource *curr_clocksource; static LIST_HEAD(clocksource_list); static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +static char override_name[CS_NAME_LEN]; static int finished_booting; #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; @@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data) if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + /* - * We just marked the clocksource as highres-capable, - * notify the rest of the system as well so that we - * transition into high-res mode: + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. */ - tick_clock_notify(); + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } } } @@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - struct clocksource *tmp; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); - if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - /* cs is a watched clocksource. 
*/ - list_del_init(&cs->wd_list); - } else if (cs == watchdog) { - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - /* Current watchdog is removed. Find an alternative. */ - watchdog = NULL; - list_for_each_entry(tmp, &clocksource_list, list) { - if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) - continue; - if (!watchdog || tmp->rating > watchdog->rating) - watchdog = tmp; + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); } } - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; - /* Check if the watchdog timer needs to be stopped. */ - clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; LIST_HEAD(unstable); + int select = 0; - mutex_lock(&clocksource_mutex); spin_lock_irqsave(&watchdog_lock, flags); - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); list_add(&cs->wd_list, &unstable); + select = 1; } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; + } + } /* Check if the watchdog timer needs to be stopped. */ clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data) list_del_init(&cs->wd_list); __clocksource_change_rating(cs, 0); } + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs) #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { - struct clocksource *best, *cs; + struct clocksource *cs; if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. 
+ */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + +static void __clocksource_select(bool skipcur) +{ + bool oneshot = tick_oneshot_mode_active(); + struct clocksource *best, *cs; + + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot, skipcur); + if (!best) return; - /* First clocksource on the list has the best rating. */ - best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; if (strcmp(cs->name, override_name) != 0) continue; /* @@ -578,8 +625,7 @@ static void clocksource_select(void) * capable clocksource if the tick code is in oneshot * mode (highres or nohz) */ - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - tick_oneshot_mode_active()) { + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ printk(KERN_WARNING "Override clocksource %s is not " "HRT compatible. Cannot switch while in " @@ -590,16 +636,35 @@ static void clocksource_select(void) best = cs; break; } - if (curr_clocksource != best) { - printk(KERN_INFO "Switching to clocksource %s\n", best->name); + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; - timekeeping_notify(curr_clocksource); } } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } #endif @@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void) { mutex_lock(&clocksource_mutex); curr_clocksource = clocksource_default_clock(); - mutex_unlock(&clocksource_mutex); - finished_booting = 1; - /* * Run the watchdog first to eliminate unstable clock sources */ - clocksource_watchdog_kthread(NULL); - - mutex_lock(&clocksource_mutex); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - clocksource_select(); } /** @@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating) { mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); + clocksource_select(); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. 
+ */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + /** * clocksource_unregister - remove a registered clocksource * @cs: clocksource to be unregistered */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs) { + int ret = 0; + mutex_lock(&clocksource_mutex); - clocksource_dequeue_watchdog(cs); - list_del(&cs->list); - clocksource_select(); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); mutex_unlock(&clocksource_mutex); + return ret; } EXPORT_SYMBOL(clocksource_unregister); @@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev, return count; } +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused @@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret = count; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(override_name)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; + size_t ret; mutex_lock(&clocksource_mutex); - if (count > 0) - memcpy(override_name, buf, count); - override_name[count] = 0; - clocksource_select(); + ret = sysfs_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); mutex_unlock(&clocksource_mutex); @@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, } /** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + size_t ret; + + ret = sysfs_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? 
ret : count; +} + +/** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused * @attr: unused @@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev, static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + static DEVICE_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); @@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void) &device_clocksource, &dev_attr_current_clocksource); if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); + if (!error) error = device_create_file( &device_clocksource, &dev_attr_available_clocksource); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577..bb2215174f0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif @@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 00000000000..0b479a6a22b --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,212 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/timer.h> +#include <linux/sched_clock.h> + +struct clock_data { + u64 epoch_ns; + u32 epoch_cyc; + u32 epoch_cyc_copy; + unsigned long rate; + u32 mult; + u32 shift; + bool suspended; +}; + +static void sched_clock_poll(unsigned long wrap_ticks); +static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { + .mult = NSEC_PER_SEC / HZ, +}; + +static u32 __read_mostly sched_clock_mask = 0xffffffff; + +static u32 notrace jiffy_sched_clock_read(void) +{ + return (u32)(jiffies - INITIAL_JIFFIES); +} + +static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +static unsigned long long notrace sched_clock_32(void) +{ + u64 epoch_ns; + u32 epoch_cyc; + u32 cyc; + + if (cd.suspended) + return cd.epoch_ns; + + /* + * Load the epoch_cyc and epoch_ns atomically. We do this by + * ensuring that we always write epoch_cyc, epoch_ns and + * epoch_cyc_copy in strict order, and read them in strict order. + * If epoch_cyc and epoch_cyc_copy are not equal, then we're in + * the middle of an update, and we should repeat the load. 
+ */ + do { + epoch_cyc = cd.epoch_cyc; + smp_rmb(); + epoch_ns = cd.epoch_ns; + smp_rmb(); + } while (epoch_cyc != cd.epoch_cyc_copy); + + cyc = read_sched_clock(); + cyc = (cyc - epoch_cyc) & sched_clock_mask; + return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. + */ +static void notrace update_sched_clock(void) +{ + unsigned long flags; + u32 cyc; + u64 ns; + + cyc = read_sched_clock(); + ns = cd.epoch_ns + + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + /* + * Write epoch_cyc and epoch_ns in a way that the update is + * detectable in cyc_to_fixed_sched_clock(). + */ + raw_local_irq_save(flags); + cd.epoch_cyc_copy = cyc; + smp_wmb(); + cd.epoch_ns = ns; + smp_wmb(); + cd.epoch_cyc = cyc; + raw_local_irq_restore(flags); +} + +static void sched_clock_poll(unsigned long wrap_ticks) +{ + mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); + update_sched_clock(); +} + +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +{ + unsigned long r, w; + u64 res, wrap; + char r_unit; + + if (cd.rate > rate) + return; + + BUG_ON(bits > 32); + WARN_ON(!irqs_disabled()); + read_sched_clock = read; + sched_clock_mask = (1ULL << bits) - 1; + cd.rate = rate; + + /* calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else + r_unit = ' '; + + /* calculate how many ns until we wrap */ + wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); + do_div(wrap, NSEC_PER_MSEC); + w = wrap; + + /* calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, cd.mult, cd.shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", + bits, r, r_unit, res, w); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); + update_sched_clock(); + + /* + * Ensure that sched_clock() starts off at 0ns + */ + cd.epoch_ns = 0; + + /* Enable IRQ time accounting if we have a fast enough sched_clock */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; + +unsigned long long notrace sched_clock(void) +{ + return sched_clock_func(); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock function has been provided at that point, + * make it the final one one. 
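The new generic sched_clock code above extends a free-running 32-bit counter into a 64-bit nanosecond clock: update_sched_clock() refreshes the (epoch_cyc, epoch_ns) pair well before the counter wraps, and readers re-check epoch_cyc against epoch_cyc_copy to detect an update in flight. A minimal userspace sketch of the read-side arithmetic, with a made-up 1 MHz counter and epoch values chosen to straddle a counter wrap:

/* Userspace model of ns = epoch_ns + cyc_to_ns((cyc - epoch_cyc) & mask).
 * All values below are invented for illustration. */
#include <stdint.h>
#include <stdio.h>

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
        return (cyc * mult) >> shift;
}

int main(void)
{
        const uint32_t mask = 0xffffffff;       /* 32-bit counter */
        const uint32_t mult = 1000, shift = 0;  /* 1 tick = 1000 ns (1 MHz) */

        /* Epoch taken shortly before the counter wraps... */
        uint32_t epoch_cyc = 0xfffffff0;
        uint64_t epoch_ns  = 4000000000000ULL;

        /* ...and a reading taken just after the wrap. */
        uint32_t cyc = 0x00000010;

        /* Masked delta stays small and correct across the wrap: 0x20 ticks. */
        uint64_t delta = (cyc - epoch_cyc) & mask;

        printf("now = %llu ns\n",
               (unsigned long long)(epoch_ns + cyc_to_ns(delta, mult, shift)));
        return 0;
}
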
+ */ + if (read_sched_clock == jiffy_sched_clock_read) + setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + + sched_clock_poll(sched_clock_timer.data); +} + +static int sched_clock_suspend(void) +{ + sched_clock_poll(sched_clock_timer.data); + cd.suspended = true; + return 0; +} + +static void sched_clock_resume(void) +{ + cd.epoch_cyc = read_sched_clock(); + cd.epoch_cyc_copy = cd.epoch_cyc; + cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f..218bcb565fe 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/module.h> #include "tick-internal.h" @@ -29,6 +30,7 @@ static struct tick_device tick_broadcast_device; static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) /* * Check, if the device can be utilized as broadcast device: */ -int tick_check_broadcast_device(struct clock_event_device *dev) +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev) { struct clock_event_device *cur = tick_broadcast_device.evtdev; - if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || - (tick_broadcast_device.evtdev && - tick_broadcast_device.evtdev->rating >= dev->rating) || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + if (!tick_check_broadcast_device(cur, dev)) + return; + + if (!try_module_get(dev->owner)) + return; - clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + clockevents_exchange_device(cur, dev); if (cur) cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; @@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) */ if (dev->features & CLOCK_EVT_FEAT_ONESHOT) tick_clock_notify(); - return 1; } /* @@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) */ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret = 0; + int ret; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -138,20 +157,62 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); cpumask_set_cpu(cpu, tick_broadcast_mask); - tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); ret = 1; 
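tick_check_broadcast_device() above folds the old open-coded checks into one predicate: dummy or C3STOP-afflicted devices are never usable for broadcast, oneshot mode additionally requires a oneshot-capable device, and otherwise the higher rated device wins. A small userspace model of that rule; the feature bits, device names and ratings below are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_ONESHOT    0x01
#define FEAT_C3STOP     0x02
#define FEAT_DUMMY      0x04

struct dev { const char *name; unsigned int feat; int rating; };

static bool check_broadcast(const struct dev *cur, const struct dev *new,
                            bool oneshot_mode)
{
        if (new->feat & (FEAT_DUMMY | FEAT_C3STOP))
                return false;           /* unusable as a broadcast device */
        if (oneshot_mode && !(new->feat & FEAT_ONESHOT))
                return false;           /* must match the current tick mode */
        return !cur || new->rating > cur->rating;
}

int main(void)
{
        struct dev hpet  = { "hpet",  FEAT_ONESHOT, 50 };
        struct dev lapic = { "lapic", FEAT_ONESHOT | FEAT_C3STOP, 100 };

        printf("hpet:  %d\n", check_broadcast(NULL, &hpet, true));   /* 1 */
        printf("lapic: %d\n", check_broadcast(&hpet, &lapic, true)); /* 0: C3STOP */
        return 0;
}
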
} else { /* - * When the new device is not affected by the stop - * feature and the cpu is marked in the broadcast mask - * then clear the broadcast bit. + * Clear the broadcast bit for this cpu if the + * device is not power state affected. */ - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { - int cpu = smp_processor_id(); + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) cpumask_clear_cpu(cpu, tick_broadcast_mask); - tick_broadcast_clear_oneshot(cpu); - } else { + else tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device. + */ + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + /* Nothing to do */ + ret = 0; + break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -281,6 +342,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) @@ -290,8 +352,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_force = 1; break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (!tick_broadcast_force && - cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_force) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -349,6 +415,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) @@ -475,7 +542,15 @@ void tick_check_oneshot_broadcast(int cpu) if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_set_mode(td->evtdev, + CLOCK_EVT_MODE_ONESHOT); + } } } @@ -522,6 +597,13 @@ again: cpumask_clear(tick_broadcast_force_mask); /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. 
+ */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + + /* * Wakeup the cpus which have an expired event. */ tick_do_broadcast(tmpmask); @@ -599,8 +681,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 == KTIME_MAX) - goto out; /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -615,6 +695,11 @@ void tick_broadcast_oneshot_control(unsigned long reason) goto out; /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* * If the pending bit is not set, then we are * either the CPU handling the broadcast * interrupt or we got woken by something else. @@ -698,10 +783,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; - /* Take the do_timer update */ - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - /* * We must be careful here. There might be other CPUs * waiting for periodic broadcast. We need to set the @@ -762,10 +843,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! */ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -793,6 +876,7 @@ bool tick_broadcast_oneshot_available(void) void __init tick_broadcast_init(void) { zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); #ifdef CONFIG_TICK_ONESHOT zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc0..64522ecdfe0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td, * When global broadcasting is active, check if the current * device is registered as a placeholder for broadcast mode. * This allows us to handle this x86 misfeature in a generic - * way. + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. 
*/ if (tick_device_uses_broadcast(newdev, cpu)) return; @@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td, tick_setup_oneshot(newdev, handler, next_event); } +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev) { struct clock_event_device *curdev; struct tick_device *td; - int cpu, ret = NOTIFY_OK; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_device_lock, flags); + int cpu; cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - - /* - * If the cpu affinity of the device interrupt can not - * be set, ignore it. - */ - if (!irq_can_set_affinity(newdev->irq)) - goto out_bc; + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; - /* - * If we have a cpu local device already, do not replace it - * by a non cpu local device - */ - if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) - goto out_bc; - } + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; - /* - * If we have an active device, then check the rating and the oneshot - * feature. - */ - if (curdev) { - /* - * Prefer one shot capable devices ! 
- */ - if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && - !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) - goto out_bc; - /* - * Check the rating - */ - if (curdev->rating >= newdev->rating) - goto out_bc; - } + if (!try_module_get(newdev->owner)) + return; /* * Replace the eventually existing device by the new @@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev) tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - return NOTIFY_STOP; + return; out_bc: /* * Can the new device be used as a broadcast device ? */ - if (tick_check_broadcast_device(newdev)) - ret = NOTIFY_STOP; - - raw_spin_unlock_irqrestore(&tick_device_lock, flags); - - return ret; + tick_install_broadcast_device(newdev); } /* @@ -294,7 +321,7 @@ out_bc: * * Called with interrupts disabled. */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup) { if (*cpup == tick_do_timer_cpu) { int cpu = cpumask_first(cpu_online_mask); @@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup) { struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); struct clock_event_device *dev = td->evtdev; - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup) dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -static void tick_suspend(void) +void tick_suspend(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -static void tick_resume(void) +void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); - unsigned long flags; int broadcast = tick_resume_broadcast(); - raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -357,68 +376,12 @@ static void tick_resume(void) else tick_resume_oneshot(); } - raw_spin_unlock_irqrestore(&tick_device_lock, flags); } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, - void *dev) -{ - switch (reason) { - - case CLOCK_EVT_NOTIFY_ADD: - return tick_check_new_device(dev); - - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, dev); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(dev); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(dev); - tick_shutdown_broadcast(dev); - tick_shutdown(dev); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block tick_notifier = { - .notifier_call = tick_notify, -}; - /** * tick_init - 
initialize the tick control - * - * Register the notifier with the clockevents framework */ void __init tick_init(void) { - clockevents_register_notifier(&tick_notifier); tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae460..bc906cad709 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,6 +6,8 @@ extern seqlock_t jiffies_lock; +#define CS_NAME_LEN 32 + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 @@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + /* * NO_HZ / high resolution timer shared code */ @@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { - return 0; } static inline int tick_is_broadcast_device(struct clock_event_device *dev) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf..3612fc77f83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -23,6 +23,7 @@ #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/perf_event.h> +#include <linux/context_tracking.h> #include <asm/irq_regs.h> @@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) } #ifdef CONFIG_NO_HZ_FULL -static cpumask_var_t nohz_full_mask; -bool have_nohz_full_mask; +cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; static bool can_stop_full_tick(void) { @@ -178,6 +179,12 @@ static bool can_stop_full_tick(void) */ if (!sched_clock_stable) { trace_tick_stop(0, "unstable sched clock\n"); + /* + * Don't allow the user to think they can get + * full NO_HZ with this machine. + */ + WARN_ONCE(tick_nohz_full_running, + "NO_HZ FULL will not work with unstable sched clock"); return false; } #endif @@ -191,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. 
*/ -void tick_nohz_full_check(void) +void __tick_nohz_full_check(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); @@ -205,7 +212,7 @@ void tick_nohz_full_check(void) static void nohz_full_kick_work_func(struct irq_work *work) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -224,7 +231,7 @@ void tick_nohz_full_kick(void) static void nohz_full_kick_ipi(void *info) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } /* @@ -233,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) */ void tick_nohz_full_kick_all(void) { - if (!have_nohz_full_mask) + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(nohz_full_mask, + smp_call_function_many(tick_nohz_full_mask, nohz_full_kick_ipi, NULL, false); + tick_nohz_full_kick(); preempt_enable(); } @@ -247,7 +255,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(struct task_struct *tsk) { unsigned long flags; @@ -263,37 +271,29 @@ out: local_irq_restore(flags); } -int tick_nohz_full_cpu(int cpu) -{ - if (!have_nohz_full_mask) - return 0; - - return cpumask_test_cpu(cpu, nohz_full_mask); -} - /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { int cpu; - alloc_bootmem_cpumask_var(&nohz_full_mask); - if (cpulist_parse(str, nohz_full_mask) < 0) { + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; } cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, nohz_full_mask)) { + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - have_nohz_full_mask = true; + tick_nohz_full_running = true; return 1; } __setup("nohz_full=", tick_nohz_full_setup); -static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -305,8 +305,8 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * If we handle the timekeeping duty for full dynticks CPUs, * we can't safely shutdown that CPU. 
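tick_nohz_full_setup() above hands the nohz_full= boot parameter to cpulist_parse(), which accepts a comma-separated list of CPUs and CPU ranges such as "1-3,5". A deliberately simplified userspace parser, assuming at most 64 CPUs and ignoring the stride syntax, just to illustrate the accepted format:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 64      /* assumption for this sketch */

static int parse_cpulist(const char *str, unsigned long long *mask)
{
        char buf[256], *tok, *save;

        *mask = 0;
        snprintf(buf, sizeof(buf), "%s", str);

        for (tok = strtok_r(buf, ",", &save); tok;
             tok = strtok_r(NULL, ",", &save)) {
                char *dash = strchr(tok, '-');
                long a = strtol(tok, NULL, 10);
                long b = dash ? strtol(dash + 1, NULL, 10) : a;

                if (a < 0 || b >= NR_CPUS || a > b)
                        return -1;
                for (long cpu = a; cpu <= b; cpu++)
                        *mask |= 1ULL << cpu;
        }
        return 0;
}

int main(void)
{
        unsigned long long mask;

        if (parse_cpulist("1-3,5", &mask) == 0)
                printf("mask = 0x%llx\n", mask);        /* 0x2e: cpus 1,2,3,5 */
        return 0;
}
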
*/ - if (have_nohz_full_mask && tick_do_timer_cpu == cpu) - return -EINVAL; + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return NOTIFY_BAD; break; } return NOTIFY_OK; @@ -324,14 +324,14 @@ static int tick_nohz_init_all(void) int err = -1; #ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; - cpumask_setall(nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); - have_nohz_full_mask = true; + cpumask_setall(tick_nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + tick_nohz_full_running = true; #endif return err; } @@ -340,27 +340,18 @@ void __init tick_nohz_init(void) { int cpu; - if (!have_nohz_full_mask) { + if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) return; } - cpu_notifier(tick_nohz_cpu_down_callback, 0); - - /* Make sure full dynticks CPU are also RCU nocbs */ - for_each_cpu(cpu, nohz_full_mask) { - if (!rcu_is_nocb_cpu(cpu)) { - pr_warning("NO_HZ: CPU %d is not RCU nocb: " - "cleared from nohz_full range", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); - } - } + for_each_cpu(cpu, tick_nohz_full_mask) + context_tracking_cpu_set(cpu); - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + cpu_notifier(tick_nohz_cpu_down_callback, 0); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } -#else -#define have_nohz_full_mask (0) #endif /* @@ -738,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (have_nohz_full_mask) { + if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around @@ -832,13 +823,10 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (ts->inidle) { - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); + if (ts->inidle) __tick_nohz_idle_enter(ts); - } else { + else tick_nohz_full_stop_tick(ts); - } } /** @@ -936,8 +924,6 @@ void tick_nohz_idle_exit(void) ts->inidle = 0; - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); if (ts->idle_active || ts->tick_stopped) now = ktime_get(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index baeeb5c87cf..947ba25a95a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,11 @@ #include "tick-internal.h" #include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) static struct timekeeper timekeeper; static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); -static void update_pvclock_gtod(struct timekeeper *tk) +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { - raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** @@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, true); 
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; @@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) +static void timekeeping_update(struct timekeeper *tk, unsigned int action) { - if (clearntp) { + if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } update_vsyscall(tk); - update_pvclock_gtod(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - if (mirror) + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } @@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -627,13 +632,22 @@ static int change_clocksource(void *data) write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); - if (!new->enable || new->enable(new) == 0) { - old = tk->clock; - tk_setup_internals(tk, new); - if (old->disable) - old->disable(old); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } } - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -648,14 +662,15 @@ static int change_clocksource(void *data) * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &timekeeper; if (tk->clock == clock) - return; + return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); + return tk->clock == clock ? 
0 : -1; } /** @@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); + tk_debug_account_sleep_time(delta); } /** @@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true, true); + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -954,7 +970,7 @@ static void timekeeping_resume(void) tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false, true); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1236,9 +1252,10 @@ out_adjust: * It also calls into the NTP code to handle leapsecond processing. * */ -static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; + unsigned int action = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_was_set_delayed(); + action = TK_CLOCK_WAS_SET; } } + return action; } /** @@ -1347,6 +1366,7 @@ static void update_wall_time(void) struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; + unsigned int action; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1399,7 +1419,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(tk); + action = accumulate_nsecs_to_secs(tk); write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ @@ -1415,7 +1435,7 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, false, false); + timekeeping_update(real_tk, action); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1677,11 +1697,14 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); + update_pvclock_gtod(tk, true); clock_was_set_delayed(); } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ntp_notify_cmos_timer(); + return ret; } diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 00000000000..802433a4f5e --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
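tk_debug_account_sleep_time(), already called from __timekeeping_inject_sleeptime() above and implemented in this new file, counts each suspend interval in bin fls(tv_sec), so bin n collects sleeps in the range [2^(n-1), 2^n) seconds, which is exactly the range tk_debug_show_sleep_time() prints. A userspace sketch of the binning, modelling fls() with __builtin_clz() and using made-up sleep durations:

#include <stdio.h>

static int fls_model(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int secs[] = { 0, 1, 3, 5, 17, 300 };  /* invented samples */

        for (unsigned int i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
                int bin = fls_model(secs[i]);

                /* mirrors the "time (secs)" column of the debugfs file */
                printf("%3us -> bin %2d (%u - %u secs)\n", secs[i], bin,
                       bin ? 1u << (bin - 1) : 0, 1u << bin);
        }
        return 0;
}
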
+ */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/time.h> + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 00000000000..13323ea08ff --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3bdf2832301..61ed862cdd3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now) static int timer_list_show(struct seq_file *m, void *v) { struct timer_list_iter *iter = v; - u64 now = ktime_to_ns(ktime_get()); if (iter->cpu == -1 && !iter->second_pass) - timer_list_header(m, now); + timer_list_header(m, iter->now); else if (!iter->second_pass) print_cpu(m, iter->cpu, iter->now); #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -298,33 +297,41 @@ void sysrq_timer_list_show(void) return; } -static void *timer_list_start(struct seq_file *file, loff_t *offset) +static void *move_iter(struct timer_list_iter *iter, loff_t offset) { - struct timer_list_iter *iter = file->private; - - if (!*offset) { - iter->cpu = -1; - iter->now = ktime_to_ns(ktime_get()); - } else if (iter->cpu >= nr_cpu_ids) { + for (; offset; offset--) { + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + if (iter->cpu >= nr_cpu_ids) { #ifdef CONFIG_GENERIC_CLOCKEVENTS - if (!iter->second_pass) { - iter->cpu = -1; - iter->second_pass = true; - } else - return NULL; + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; #else - return NULL; + return NULL; #endif + } } return iter; } +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) + iter->now = ktime_to_ns(ktime_get()); + iter->cpu = -1; + iter->second_pass = false; + return move_iter(iter, *offset); +} + static void *timer_list_next(struct seq_file *file, void 
*v, loff_t *offset) { struct timer_list_iter *iter = file->private; - iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); ++*offset; - return timer_list_start(file, offset); + return move_iter(iter, 1); } static void timer_list_stop(struct seq_file *seq, void *v) diff --git a/kernel/timer.c b/kernel/timer.c index 15ffdb3f194..4296d13db3d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; - if (j <= jiffies) /* rounding ate our timeout entirely; */ - return original; - return j; + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; } /** @@ -1503,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int __cpuinit init_timers_cpu(int cpu) +static int init_timers_cpu(int cpu) { int j; struct tvec_base *base; - static char __cpuinitdata tvec_base_done[NR_CPUS]; + static char tvec_base_done[NR_CPUS]; if (!tvec_base_done[cpu]) { static char boot_done; @@ -1575,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea } } -static void __cpuinit migrate_timers(int cpu) +static void migrate_timers(int cpu) { struct tvec_base *old_base; struct tvec_base *new_base; @@ -1608,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit timer_cpu_notify(struct notifier_block *self, +static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1633,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata timers_nb = { +static struct notifier_block timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c6..03cf44ac54d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ +} + static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * so there'll be no new users. We must ensure * all current users are done before we free * the control data. + * Note synchronize_sched() is not enough, as we + * use preempt_disable() to do RCU, but the function + * tracer can be called where RCU is not active + * (before user_exit()). */ - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); control_ops_free(ops); } } else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. 
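The round_jiffies_common() hunk above replaces the plain "j <= jiffies" test with time_is_after_jiffies(), which keeps working when jiffies wraps around. A userspace model of the underlying signed-difference idiom; the helper name and sample values are invented:

#include <stdio.h>

typedef unsigned long ulong;

static int time_after_model(ulong a, ulong b)   /* true if a is after b */
{
        return (long)(b - a) < 0;
}

int main(void)
{
        ulong jiffies = (ulong)-10;     /* 10 ticks before the wrap */
        ulong j = jiffies + 20;         /* rounded target, past the wrap */

        /* 1: the naive test wrongly claims the rounded value is not in
         * the future, so the old code would fall back to "original". */
        printf("naive j <= jiffies : %d\n", j <= jiffies);

        /* 1: the signed difference sees j as still in the future. */
        printf("wrap-safe time_after(j, jiffies): %d\n",
               time_after_model(j, jiffies));
        return 0;
}
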
*/ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - synchronize_sched(); + schedule_on_each_cpu(ftrace_sync); + return 0; } @@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v) if (rec->counter <= 1) stddev = 0; else { - stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + /* * Divide only 1000 for ns^2 -> us^2 conversion. * trace_print_graph_duration will divide 1000 again. */ - do_div(stddev, (rec->counter - 1) * 1000); + do_div(stddev, rec->counter * (rec->counter - 1) * 1000); } trace_seq_init(&s); @@ -1416,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, * the hashes are freed with call_rcu_sched(). */ static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { struct ftrace_hash *filter_hash; struct ftrace_hash *notrace_hash; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); @@ -1943,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { + int update = command & FTRACE_UPDATE_TRACE_FUNC; + + /* + * If the ftrace_caller calls a ftrace_ops func directly, + * we need to make sure that it only traces functions it + * expects to trace. When doing the switch of functions, + * we need to update to the ftrace_ops_list_func first + * before the transition between old and new calls are set, + * as the ftrace_ops_list_func will check the ops hashes + * to make sure the ops are having the right functions + * traced. + */ + if (update) + ftrace_update_ftrace_func(ftrace_ops_list_func); + if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (command & FTRACE_UPDATE_TRACE_FUNC) + if (update && ftrace_trace_function != ftrace_ops_list_func) ftrace_update_ftrace_func(ftrace_trace_function); if (command & FTRACE_START_FUNC_RET) @@ -2134,12 +2184,57 @@ static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; -static int ops_traces_mod(struct ftrace_ops *ops) +static inline int ops_traces_mod(struct ftrace_ops *ops) { - struct ftrace_hash *hash; + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. + */ + return ftrace_hash_empty(ops->filter_hash) && + ftrace_hash_empty(ops->notrace_hash); +} - hash = ops->filter_hash; - return ftrace_hash_empty(hash); +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. 
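The function_stat_show() hunk above computes the variance from the per-record running sums as s^2 = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)); the additional divide by 1000 in the kernel converts ns^2 so that trace_print_graph_duration() can divide by 1000 once more. A userspace check of the arithmetic with three made-up samples:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* three samples: 100 ns, 200 ns, 300 ns (invented) */
        uint64_t rec_counter = 3;
        uint64_t rec_time = 100 + 200 + 300;                     /* sum x   */
        uint64_t rec_time_squared = 100*100 + 200*200 + 300*300; /* sum x^2 */

        /* same formula as the kernel's "stddev" variable before scaling */
        uint64_t var = rec_counter * rec_time_squared - rec_time * rec_time;

        var /= rec_counter * (rec_counter - 1);

        printf("variance = %llu ns^2\n", (unsigned long long)var); /* 10000 */
        return 0;
}
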
+ */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + /* If ops traces all mods, we already accounted for it */ + if (ops_traces_mod(ops)) + return 0; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->filter_hash) && + !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + return 0; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + return 0; + + return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; } static int ftrace_update_code(struct module *mod) @@ -2148,6 +2243,7 @@ static int ftrace_update_code(struct module *mod) struct dyn_ftrace *p; cycle_t start, stop; unsigned long ref = 0; + bool test = false; int i; /* @@ -2161,9 +2257,12 @@ static int ftrace_update_code(struct module *mod) for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && - ops_traces_mod(ops)) - ref++; + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + if (ops_traces_mod(ops)) + ref++; + else + test = true; + } } } @@ -2173,12 +2272,16 @@ static int ftrace_update_code(struct module *mod) for (pg = ftrace_new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { + int cnt = ref; + /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - p->flags = ref; + if (test) + cnt += referenced_filters(p); + p->flags = cnt; /* * Do the initial record conversion from mcount jump @@ -2198,7 +2301,7 @@ static int ftrace_update_code(struct module *mod) * conversion puts the module to the correct state, thus * passing the ftrace_make_call check. 
*/ - if (ftrace_start_up && ref) { + if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) ftrace_bug(failed, p->ip); @@ -3349,6 +3452,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } +static void ftrace_ops_update_code(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) @@ -3391,9 +3500,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(ops); mutex_unlock(&ftrace_lock); @@ -3512,8 +3620,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + static int __init set_ftrace_notrace(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } @@ -3521,6 +3633,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { + ftrace_filter_param = true; strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } @@ -3615,9 +3728,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); - if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) - && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (!ret) + ftrace_ops_update_code(iter->ops); mutex_unlock(&ftrace_lock); } @@ -4188,7 +4300,7 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { return 1; } @@ -4211,7 +4323,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip)) + ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); @@ -4244,7 +4356,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip)) + if (ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e444ff88f0a..cc2f66f68dc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "# compressed entry header\n"); - ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_printf(s, "\tarray : 32 bits\n"); - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_puts(s, "# compressed 
entry header\n"); + ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_puts(s, "\tarray : 32 bits\n"); + ret = trace_seq_putc(s, '\n'); ret = trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", @@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, } /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * * As a safety measure we check to make sure the data pages have not @@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self, #endif /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. * @flags: attributes to set for the ring buffer. * @@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work) * ring_buffer_resize - resize the ring buffer * @buffer: the buffer to resize. * @size: the new size. + * @cpu_id: the cpu buffer to resize * * Minimum size is 2 * BUF_PAGE_SIZE. * @@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * expected. * * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going * for real. * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); * an intervening ring_buffer_read_prepare_sync must have been * performed. * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish. */ void ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) EXPORT_SYMBOL_GPL(ring_buffer_read_start); /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * * This re-enables the recording to the buffer, and frees the @@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. * * This function is used in conjunction with ring_buffer_read_page. * When reading a full page from the ring buffer, these functions @@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * to swap with a page in the ring buffer. 
* * for example: - * rpage = ring_buffer_alloc_read_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f8..7974ba20557 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask; enum ftrace_dump_mode ftrace_dump_on_oops; +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + static int tracing_set_tracer(const char *buf); #define MAX_TRACER_SIZE 100 @@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); + static int __init boot_alloc_snapshot(char *str) { allocate_snapshot = true; @@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); + unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; @@ -193,6 +204,37 @@ static struct trace_array global_trace; LIST_HEAD(ftrace_trace_arrays); +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ + WARN_ON(!this_tr->ref); + this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} + int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) @@ -201,23 +243,43 @@ int filter_current_check_discard(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.trace_buffer.buffer) + if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(buf->buffer, cpu); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } +cycle_t ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +} + +/** + * tracing_is_enabled - Show if global_trace has been disabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */ int tracing_is_enabled(void) { - return tracing_is_on(); + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. 
+ */ + smp_rmb(); + return !global_trace.buffer_disabled; } /* @@ -240,7 +302,7 @@ static struct tracer *trace_types __read_mostly; /* * trace_types_lock is used to protect the trace_types list. */ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock); /* * serialize the access of the ring buffer @@ -330,6 +392,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +static void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_on(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_on - enable tracing buffers * @@ -338,15 +417,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void tracing_on(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_on(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. - */ - global_trace.buffer_disabled = 0; + tracer_tracing_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_on); @@ -540,6 +611,23 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ +static void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + ring_buffer_record_off(tr->trace_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + /** * tracing_off - turn off tracing buffers * @@ -550,26 +638,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); */ void tracing_off(void) { - if (global_trace.trace_buffer.buffer) - ring_buffer_record_off(global_trace.trace_buffer.buffer); - /* - * This flag is only looked at when buffers haven't been - * allocated yet. We don't really care about the race - * between setting this flag and actually turning - * on the buffer. - */ - global_trace.buffer_disabled = 1; + tracer_tracing_off(&global_trace); } EXPORT_SYMBOL_GPL(tracing_off); +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) + tracing_off(); +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. 
+ */ +static int tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->trace_buffer.buffer) + return ring_buffer_record_is_on(tr->trace_buffer.buffer); + return !tr->buffer_disabled; +} + /** * tracing_is_on - show state of ring buffers enabled */ int tracing_is_on(void) { - if (global_trace.trace_buffer.buffer) - return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); - return !global_trace.buffer_disabled; + return tracer_tracing_is_on(&global_trace); } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -652,8 +749,6 @@ static struct { ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -1121,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) /* Make sure all commits have finished */ synchronize_sched(); - buf->time_start = ftrace_now(buf->cpu); + buf->time_start = buffer_ftrace_now(buf, buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -1129,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_enable(buffer); } -void tracing_reset_current(int cpu) -{ - tracing_reset(&global_trace.trace_buffer, cpu); -} - +/* Must have trace_types_lock held */ void tracing_reset_all_online_cpus(void) { struct trace_array *tr; - mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } - mutex_unlock(&trace_types_lock); } #define SAVED_CMDLINES 128 @@ -1545,15 +1634,6 @@ trace_function(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) -{ - if (likely(!atomic_read(&data->disabled))) - trace_function(tr, ip, parent_ip, flags, pc); -} - #ifdef CONFIG_STACKTRACE #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) @@ -2762,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v) return 0; } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -2772,8 +2863,7 @@ static const struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; @@ -2814,8 +2904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); - iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) @@ -2826,7 +2916,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* stop the trace while dumping if we are not opening "snapshot" */ @@ -2852,8 +2942,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) tracing_iter_reset(iter, cpu); } - tr->ref++; - mutex_unlock(&trace_types_lock); return iter; @@ -2876,24 +2964,41 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + if (tracing_disabled) + return -ENODEV; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + filp->private_data = inode->i_private; + + return 0; +} + static int tracing_release(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; - struct trace_array *tr; int cpu; - if (!(file->f_mode & FMODE_READ)) + if (!(file->f_mode & FMODE_READ)) { + trace_array_put(tr); return 0; + } + /* Writes do not use seq_file */ iter = m->private; - tr = iter->tr; - mutex_lock(&trace_types_lock); - WARN_ON(!tr->ref); - tr->ref--; - for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2905,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); + + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2912,24 +3020,44 @@ static int tracing_release(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter->buffer_iter); seq_release_private(inode, file); + + return 0; +} + +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); return 0; } +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + static int tracing_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was open for write, then erase contents */ - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); - if (tc->cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(&tr->trace_buffer, tc->cpu); + tracing_reset(&tr->trace_buffer, cpu); } if (file->f_mode & FMODE_READ) { @@ -2939,6 +3067,10 @@ static int tracing_open(struct inode *inode, struct file *file) else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; } + + if (ret < 0) + trace_array_put(tr); + return ret; } @@ -3034,11 +3166,6 @@ static const struct file_operations show_traces_fops = { }; /* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - -/* * The tracer itself will not take this lock, but still we want * to provide a consistent cpumask to user-space: 
*/ @@ -3054,11 +3181,12 @@ static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { + struct trace_array *tr = file_inode(filp)->i_private; int len; mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -3076,7 +3204,7 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - struct trace_array *tr = filp->private_data; + struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; int err, cpu; @@ -3096,12 +3224,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpumask_test_cpu(cpu, tracing_cpumask) && + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } - if (!cpumask_test_cpu(cpu, tracing_cpumask) && + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); @@ -3110,7 +3238,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); - cpumask_copy(tracing_cpumask, tracing_cpumask_new); + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); @@ -3124,9 +3252,10 @@ err_unlock: } static const struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; @@ -3295,17 +3424,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + int ret; + if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_trace_options_show, inode->i_private); + if (trace_array_get(tr) < 0) + return -ENODEV; + + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } static const struct file_operations tracing_iter_fops = { .open = tracing_trace_options_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_trace_options_write, }; @@ -3381,14 +3520,14 @@ static const char readme_msg[] = "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" "\t\t\t Read the contents for more information\n" #endif -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" #endif -#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_STACK_TRACER */ ; static ssize_t @@ -3785,20 +3924,23 @@ 
tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int ret = 0; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + mutex_lock(&trace_types_lock); /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; + __trace_array_put(tr); goto out; } @@ -3825,12 +3967,12 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = tc->cpu; - iter->tr = tc->tr; - iter->trace_buffer = &tc->tr->trace_buffer; + iter->tr = tr; + iter->trace_buffer = &tr->trace_buffer; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); filp->private_data = iter; @@ -3845,6 +3987,7 @@ out: fail: kfree(iter->trace); kfree(iter); + __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; } @@ -3852,6 +3995,7 @@ fail: static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + struct trace_array *tr = inode->i_private; mutex_lock(&trace_types_lock); @@ -3865,6 +4009,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) kfree(iter->trace); kfree(iter); + trace_array_put(tr); + return 0; } @@ -3941,7 +4087,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. 
*/ - if (!tracing_is_enabled() && iter->pos) + if (!tracing_is_on() && iter->pos) break; } @@ -4002,6 +4148,7 @@ waitagain: memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); + cpumask_clear(iter->started); iter->pos = -1; trace_event_read_lock(); @@ -4202,15 +4349,16 @@ static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (tc->cpu == RING_BUFFER_ALL_CPUS) { + if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -4237,7 +4385,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -4249,7 +4397,8 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; unsigned long val; int ret; @@ -4263,8 +4412,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - - ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; @@ -4318,10 +4466,12 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) - tracing_off(); + tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + trace_array_put(tr); + return 0; } @@ -4330,6 +4480,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { unsigned long addr = (unsigned long)ubuf; + struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; @@ -4389,7 +4540,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.trace_buffer.buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -4480,12 +4631,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. 
*/ - tracing_reset_online_cpus(&global_trace.trace_buffer); + tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); - tracing_reset_online_cpus(&global_trace.max_buffer); + tracing_reset_online_cpus(&tr->max_buffer); #endif mutex_unlock(&trace_types_lock); @@ -4497,10 +4648,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, static int tracing_clock_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + int ret; + if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, inode->i_private); + if (trace_array_get(tr)) + return -ENODEV; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; } struct ftrace_buffer_info { @@ -4512,31 +4673,40 @@ struct ftrace_buffer_info { #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { - struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) - return -ENOMEM; + goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); - return -ENOMEM; + goto out; } - iter->tr = tc->tr; - iter->trace_buffer = &tc->tr->max_buffer; - iter->cpu_file = tc->cpu; + ret = 0; + + iter->tr = tr; + iter->trace_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } +out: + if (ret < 0) + trace_array_put(tr); return ret; } @@ -4618,9 +4788,12 @@ out: static int tracing_snapshot_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); if (file->f_mode & FMODE_READ) - return tracing_release(inode, file); + return ret; /* If write only, the seq_file is just a stub */ if (m) @@ -4686,34 +4859,38 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_total_entries_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_total_entries_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .write = tracing_mark_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_tr, .write = tracing_clock_write, }; 
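The hunks above and below follow one pattern: the per-file debugfs private data moves from struct trace_cpu to the owning trace_array, every ->open takes a reference with trace_array_get() and every ->release drops it with trace_array_put(), while the per-cpu index is recovered with tracing_get_cpu() from the value trace_create_cpu_file() stashes in i_cdev. A minimal sketch of that pairing, not part of the patch: example_open() and example_release() are hypothetical names, the helpers and error codes are the ones added in this series, and the usual linux/fs.h declarations are assumed.

	/* Illustrative sketch only; the real handlers in this series are
	 * tracing_open_generic_tr() and tracing_release_generic_tr(). */
	static int example_open(struct inode *inode, struct file *filp)
	{
		struct trace_array *tr = inode->i_private;

		if (trace_array_get(tr) < 0)	/* pin tr while the file is open */
			return -ENODEV;

		filp->private_data = tr;	/* per-cpu files would also call tracing_get_cpu(inode) here */
		return 0;
	}

	static int example_release(struct inode *inode, struct file *filp)
	{
		trace_array_put(inode->i_private);	/* balance the get taken at open */
		return 0;
	}

The point of the pairing is that an instance directory can only be torn down once its ref count drops back to zero, so a file held open keeps its trace_array alive.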
@@ -4738,23 +4915,26 @@ static const struct file_operations snapshot_raw_fops = { static int tracing_buffers_open(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; + int ret; if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) + if (!info) { + trace_array_put(tr); return -ENOMEM; + } mutex_lock(&trace_types_lock); - tr->ref++; - info->iter.tr = tr; - info->iter.cpu_file = tc->cpu; + info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; info->iter.trace_buffer = &tr->trace_buffer; info->spare = NULL; @@ -4765,7 +4945,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_unlock(&trace_types_lock); - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; } static unsigned int @@ -4865,8 +5049,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - WARN_ON(!iter->tr->ref); - iter->tr->ref--; + __trace_array_put(iter->tr); if (info->spare) ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); @@ -5068,14 +5251,14 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; struct trace_buffer *trace_buf = &tr->trace_buffer; + int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; - int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -5095,7 +5278,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - if (trace_clocks[trace_clock_id].in_ns) { + if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); @@ -5128,9 +5311,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -5319,10 +5503,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) +{ + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + ret->d_inode->i_cdev = (void *)(cpu + 1); + return ret; +} + static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5338,28 +5532,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) } /* per cpu trace_pipe */ - trace_create_file("trace_pipe", 0444, d_cpu, - 
(void *)&data->trace_cpu, &tracing_pipe_fops); + trace_create_cpu_file("trace_pipe", 0444, d_cpu, + tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_file("trace", 0644, d_cpu, - (void *)&data->trace_cpu, &tracing_fops); + trace_create_cpu_file("trace", 0644, d_cpu, + tr, cpu, &tracing_fops); - trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_buffers_fops); + trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + tr, cpu, &tracing_buffers_fops); - trace_create_file("stats", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_stats_fops); + trace_create_cpu_file("stats", 0444, d_cpu, + tr, cpu, &tracing_stats_fops); - trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_entries_fops); + trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_cpu, - (void *)&data->trace_cpu, &snapshot_fops); + trace_create_cpu_file("snapshot", 0644, d_cpu, + tr, cpu, &snapshot_fops); - trace_create_file("snapshot_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &snapshot_raw_fops); + trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + tr, cpu, &snapshot_raw_fops); #endif } @@ -5614,15 +5808,10 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; - if (buffer) - r = ring_buffer_record_is_on(buffer); - else - r = 0; - + r = tracer_tracing_is_on(tr); r = sprintf(buf, "%d\n", r); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -5644,11 +5833,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); if (val) { - ring_buffer_record_on(buffer); + tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); } else { - ring_buffer_record_off(buffer); + tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); } @@ -5661,9 +5850,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, } static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = rb_simple_read, .write = rb_simple_write, + .release = tracing_release_generic_tr, .llseek = default_llseek, }; @@ -5672,17 +5862,6 @@ struct dentry *trace_instance_dir; static void init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); -static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) -{ - int cpu; - - for_each_tracing_cpu(cpu) { - memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; - per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; - } -} - static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) { @@ -5700,8 +5879,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size return -ENOMEM; } - init_trace_buffers(tr, buf); - /* Allocate the first page for all buffers */ set_buffer_entries(&tr->trace_buffer, ring_buffer_size(tr->trace_buffer.buffer, 0)); @@ -5758,6 +5935,11 @@ static int new_instance_create(const char *name) if (!tr->name) goto out_free_tr; + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&tr->start_lock); tr->current_trace = &nop_trace; @@ -5768,17 +5950,15 @@ 
static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - /* Holder for file callbacks */ - tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - tr->trace_cpu.tr = tr; - tr->dir = debugfs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); - if (ret) + if (ret) { + debugfs_remove_recursive(tr->dir); goto out_free_tr; + } init_tracer_debugfs(tr, tr->dir); @@ -5791,6 +5971,7 @@ static int new_instance_create(const char *name) out_free_tr: if (tr->trace_buffer.buffer) ring_buffer_free(tr->trace_buffer.buffer); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -5920,22 +6101,25 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("tracing_cpumask", 0644, d_tracer, + tr, &tracing_cpumask_fops); + trace_create_file("trace_options", 0644, d_tracer, tr, &tracing_iter_fops); trace_create_file("trace", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_fops); + tr, &tracing_fops); trace_create_file("trace_pipe", 0444, d_tracer, - (void *)&tr->trace_cpu, &tracing_pipe_fops); + tr, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_entries_fops); + tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, tr, &tracing_total_entries_fops); - trace_create_file("free_buffer", 0644, d_tracer, + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); trace_create_file("trace_marker", 0220, d_tracer, @@ -5945,11 +6129,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) &trace_clock_fops); trace_create_file("tracing_on", 0644, d_tracer, - tr, &rb_simple_fops); + tr, &rb_simple_fops); #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, - (void *)&tr->trace_cpu, &snapshot_fops); + tr, &snapshot_fops); #endif for_each_tracing_cpu(cpu) @@ -5969,9 +6153,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("tracing_cpumask", 0644, d_tracer, - &global_trace, &tracing_cpumask_fops); - trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -6193,7 +6374,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ @@ -6208,7 +6389,7 @@ __init static int tracer_alloc_buffers(void) ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); - cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); @@ -6243,10 +6424,6 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; - /* Holder for file callbacks */ - global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - global_trace.trace_cpu.tr = &global_trace; - INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); @@ -6267,7 +6444,7 @@ out_free_cpumask: #ifdef CONFIG_TRACER_MAX_TRACE free_percpu(global_trace.max_buffer.data); #endif - free_cpumask_var(tracing_cpumask); + free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: 
free_cpumask_var(tracing_buffer_mask); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f..10c86fb7a2b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -130,19 +130,12 @@ enum trace_flag_type { struct trace_array; -struct trace_cpu { - struct trace_array *tr; - struct dentry *dir; - int cpu; -}; - /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { - struct trace_cpu trace_cpu; atomic_t disabled; void *buffer_page; /* ring buffer spare */ @@ -196,7 +189,6 @@ struct trace_array { bool allocated_snapshot; #endif int buffer_disabled; - struct trace_cpu trace_cpu; /* place holder */ #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; @@ -214,7 +206,7 @@ struct trace_array { struct dentry *event_dir; struct list_head systems; struct list_head events; - struct task_struct *waiter; + cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; }; @@ -224,6 +216,11 @@ enum { extern struct list_head ftrace_trace_arrays; +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -554,11 +551,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu); void poll_wait_pipe(struct trace_iterator *iter); -void ftrace(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long ip, - unsigned long parent_ip, - unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, @@ -680,6 +672,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. 
*/ +#define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -700,8 +701,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -776,6 +775,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct task_struct *task) { if (list_empty(&ftrace_pids)) @@ -901,12 +901,6 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { - TRACE_EVENT_TYPE_PRINTF = 1, - TRACE_EVENT_TYPE_RAW = 2, -}; - struct ftrace_event_field { struct list_head link; const char *name; @@ -1029,6 +1023,9 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045fab..80c36bcf66e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) + return NULL; + pc = preempt_count(); *rctxp = perf_swevent_get_recursion_context(); @@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct pt_regs regs; int rctx; + head = this_cpu_ptr(event_function.perf_events); + if (hlist_empty(head)) + return; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - - head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 1, ®s, head, NULL); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4b..368a4d50cc3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields); static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; +#define SYSTEM_FL_FREE_NAME (1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} + /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ @@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field = kmem_cache_alloc(field_cachep, GFP_TRACE); if 
(!field) - goto err; + return -ENOMEM; field->name = name; field->type = type; @@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type, list_add(&field->link, head); return 0; - -err: - kmem_cache_free(field_cachep, field); - - return -ENOMEM; } int trace_define_field(struct ftrace_event_call *call, const char *type, @@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, } call->class->reg(call, TRACE_REG_UNREGISTER, file); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* @@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; - WARN_ON_ONCE(system->ref_count == 0); - if (--system->ref_count) + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) return; list_del(&system->list); @@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { - WARN_ON_ONCE(system->ref_count == 0); - system->ref_count++; + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); } static void __get_system_dir(struct ftrace_subsystem_dir *dir) @@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ - WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) @@ -393,17 +409,55 @@ static void put_system(struct ftrace_subsystem_dir *dir) mutex_unlock(&event_mutex); } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + +static void remove_event_file_dir(struct ftrace_event_file *file) +{ + struct dentry *dir = file->dir; + struct dentry *child; + + if (dir) { + spin_lock(&dir->d_lock); /* probably unneeded */ + list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + if (child->d_inode) /* probably unneeded */ + child->d_inode->i_private = NULL; + } + spin_unlock(&dir->d_lock); + + debugfs_remove_recursive(dir); + } + + list_del(&file->list); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ -static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; - mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; @@ -429,6 +483,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, ret = 0; } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; @@ -623,18 +688,28 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file = filp->private_data; - char *buf; + struct ftrace_event_file *file; + unsigned long flags; + char buf[4] = "0"; - if (file->flags & FTRACE_EVENT_FL_ENABLED) { - if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) - buf = "0*\n"; - else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) - buf = "1*\n"; - else - buf = "1\n"; - } else - buf = "0\n"; + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + flags = file->flags; + mutex_unlock(&event_mutex); + + if (!file) + return -ENODEV; + + if (flags & FTRACE_EVENT_FL_ENABLED && + !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || + flags & FTRACE_EVENT_FL_SOFT_MODE) + strcat(buf, "*"); + + strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } @@ -643,13 +718,10 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_file *file = filp->private_data; + struct ftrace_event_file *file; unsigned long val; int ret; - if (!file) - return -EINVAL; - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -661,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: + ret = -ENODEV; mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(file, val); + file = event_file_data(filp); + if (likely(file)) + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -769,65 +844,39 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_field *field; + struct ftrace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); + struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - if (unlikely(list_empty(common_head))) - return NULL; - - field = list_entry(common_head->prev, - struct ftrace_event_field, link); - return field; + node = common_head; + break; case FORMAT_FIELD_SEPERATOR: - if (unlikely(list_empty(head))) - return NULL; - - field = list_entry(head->prev, struct ftrace_event_field, link); - return field; + node = head; + break; case FORMAT_PRINTFMT: /* all done */ return NULL; } - field = v; - if (field->link.prev == common_head) + node = node->prev; + if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; - else if (field->link.prev == head) + else if 
(node == head) return (void *)FORMAT_PRINTFMT; - - field = list_entry(field->link.prev, struct ftrace_event_field, link); - - return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ - loff_t l = 0; - void *p; - - /* Start by showing the header */ - if (!*pos) - return (void *)FORMAT_HEADER; - - p = (void *)FORMAT_HEADER; - do { - p = f_next(m, p, &l); - } while (p && l < *pos); - - return p; + else + return node; } static int f_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = m->private; + struct ftrace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; @@ -848,8 +897,7 @@ static int f_show(struct seq_file *m, void *v) return 0; } - field = v; - + field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: @@ -876,8 +924,25 @@ static int f_show(struct seq_file *m, void *v) return 0; } +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + if (!event_file_data(m->private)) + return ERR_PTR(-ENODEV); + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + static void f_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { @@ -889,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = { static int trace_format_open(struct inode *inode, struct file *file) { - struct ftrace_event_call *call = inode->i_private; struct seq_file *m; int ret; @@ -898,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = call; + m->private = file; return 0; } @@ -906,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; - struct trace_seq *s; - int r; + int id = (long)event_file_data(filp); + char buf[32]; + int len; if (*ppos) return 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + if (unlikely(!id)) + return -ENODEV; - trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->event.type); + len = sprintf(buf, "%d\n", id); - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - kfree(s); - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call; struct trace_seq *s; - int r; + int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) return -ENOMEM; trace_seq_init(s); - print_event_filter(call, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + mutex_lock(&event_mutex); + call = event_file_data(filp); + if (call) + print_event_filter(call, s); + mutex_unlock(&event_mutex); + + if (call) + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -955,9 +1021,9 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call; char *buf; - int err; + int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; @@ -972,7 +1038,12 @@ 
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_event_filter(call, buf); + mutex_lock(&event_mutex); + call = event_file_data(filp); + if (call) + err = apply_event_filter(call, buf); + mutex_unlock(&event_mutex); + free_page((unsigned long) buf); if (err < 0) return err; @@ -992,6 +1063,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) int ret; /* Make sure the system still exists */ + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { @@ -1007,6 +1079,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) } exit_loop: mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); if (!system) return -ENODEV; @@ -1014,9 +1087,17 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Some versions of gcc think dir can be uninitialized here */ WARN_ON(!dir); + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); + return -ENODEV; + } + ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); put_system(dir); + } return ret; } @@ -1027,16 +1108,23 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) + if (!dir) { + trace_array_put(tr); return -ENOMEM; + } dir->tr = tr; ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0) { + trace_array_put(tr); kfree(dir); + } filp->private_data = dir; @@ -1047,6 +1135,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct ftrace_subsystem_dir *dir = file->private_data; + trace_array_put(dir->tr); + /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable @@ -1143,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, @@ -1170,7 +1261,7 @@ static const struct file_operations ftrace_set_event_fops = { .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, - .release = seq_release, + .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { @@ -1188,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = { }; static const struct file_operations ftrace_event_id_fops = { - .open = tracing_open_generic, .read = event_id_read, .llseek = default_llseek, }; @@ -1247,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file, return ret; } +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + static int ftrace_event_avail_open(struct inode *inode, struct file *file) { @@ -1260,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; + int ret; + + if 
(trace_array_get(tr) < 0) + return -ENODEV; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); - return ftrace_event_open(inode, file, seq_ops); + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; } static struct event_subsystem * @@ -1279,7 +1385,15 @@ create_new_subsystem(const char *name) return NULL; system->ref_count = 1; - system->name = name; + + /* Only allocate if dynamic (kprobes and modules) */ + if (!core_kernel_data((unsigned long)name)) { + system->ref_count |= SYSTEM_FL_FREE_NAME; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) + goto out_free; + } else + system->name = name; system->filter = NULL; @@ -1292,6 +1406,8 @@ create_new_subsystem(const char *name) return system; out_free: + if (system->ref_count & SYSTEM_FL_FREE_NAME) + kfree(system->name); kfree(system); return NULL; } @@ -1373,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int -event_create_dir(struct dentry *parent, - struct ftrace_event_file *file, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +event_create_dir(struct dentry *parent, struct ftrace_event_file *file) { struct ftrace_event_call *call = file->event_call; struct trace_array *tr = file->tr; @@ -1406,12 +1517,13 @@ event_create_dir(struct dentry *parent, if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) trace_create_file("enable", 0644, file->dir, file, - enable); + &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, call, - id); + trace_create_file("id", 0444, file->dir, + (void *)(long)call->event.type, + &ftrace_event_id_fops); #endif /* @@ -1428,41 +1540,24 @@ event_create_dir(struct dentry *parent, } } trace_create_file("filter", 0644, file->dir, call, - filter); + &ftrace_event_filter_fops); trace_create_file("format", 0444, file->dir, call, - format); + &ftrace_event_format_fops); return 0; } -static void remove_subsystem(struct ftrace_subsystem_dir *dir) -{ - if (!dir) - return; - - if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); - list_del(&dir->list); - __put_system_dir(dir); - } -} - static void remove_event_from_tracers(struct ftrace_event_call *call) { struct ftrace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { - if (file->event_call != call) continue; - list_del(&file->list); - debugfs_remove_recursive(file->dir); - remove_subsystem(file->system); - kmem_cache_free(file_cachep, file); - + remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. 
After finding the call for this @@ -1549,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call, /* Add an event to a trace directory */ static int -__trace_add_new_event(struct ftrace_event_call *call, - struct trace_array *tr, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) { struct ftrace_event_file *file; @@ -1562,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call, if (!file) return -ENOMEM; - return event_create_dir(tr->event_dir, file, id, enable, filter, format); + return event_create_dir(tr->event_dir, file); } /* @@ -1584,25 +1674,27 @@ __trace_early_add_new_event(struct ftrace_event_call *call, } struct ftrace_module_file_ops; -static void __add_event_to_tracers(struct ftrace_event_call *call, - struct ftrace_module_file_ops *file_ops); +static void __add_event_to_tracers(struct ftrace_event_call *call); /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) { int ret; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); ret = __register_event(call, NULL); if (ret >= 0) - __add_event_to_tracers(call, NULL); + __add_event_to_tracers(call); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return ret; } /* - * Must be called under locking both of event_mutex and trace_event_sem. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. */ static void __trace_remove_event_call(struct ftrace_event_call *call) { @@ -1611,14 +1703,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) destroy_preds(call); } +static int probe_remove_event_call(struct ftrace_event_call *call) +{ + struct trace_array *tr; + struct ftrace_event_file *file; + +#ifdef CONFIG_PERF_EVENTS + if (call->perf_refcount) + return -EBUSY; +#endif + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + /* + * We can't rely on ftrace_event_enable_disable(enable => 0) + * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress + * TRACE_REG_UNREGISTER. + */ + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return -EBUSY; + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); + + __trace_remove_event_call(call); + + return 0; +} + /* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) +int trace_remove_event_call(struct ftrace_event_call *call) { + int ret; + + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); down_write(&trace_event_sem); - __trace_remove_event_call(call); + ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); + + return ret; } #define for_each_event(event, start, end) \ @@ -1628,100 +1759,21 @@ void trace_remove_event_call(struct ftrace_event_call *call) #ifdef CONFIG_MODULES -static LIST_HEAD(ftrace_module_file_list); - -/* - * Modules must own their file_operations to keep up with - * reference counting. 
- */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; - -static struct ftrace_module_file_ops * -find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) -{ - /* - * As event_calls are added in groups by module, - * when we find one file_ops, we don't need to search for - * each call in that module, as the rest should be the - * same. Only search for a new one if the last one did - * not match. - */ - if (file_ops && mod == file_ops->mod) - return file_ops; - - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - return file_ops; - } - return NULL; -} - -static struct ftrace_module_file_ops * -trace_create_file_ops(struct module *mod) -{ - struct ftrace_module_file_ops *file_ops; - - /* - * This is a bit of a PITA. To allow for correct reference - * counting, modules must "own" their file_operations. - * To do this, we allocate the file operations that will be - * used in the event directory. - */ - - file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); - if (!file_ops) - return NULL; - - file_ops->mod = mod; - - file_ops->id = ftrace_event_id_fops; - file_ops->id.owner = mod; - - file_ops->enable = ftrace_enable_fops; - file_ops->enable.owner = mod; - - file_ops->filter = ftrace_event_filter_fops; - file_ops->filter.owner = mod; - - file_ops->format = ftrace_event_format_fops; - file_ops->format.owner = mod; - - list_add(&file_ops->list, &ftrace_module_file_list); - - return file_ops; -} - static void trace_module_add_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call **call, **start, **end; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; - if (start == end) - return; - - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - for_each_event(call, start, end) { __register_event(*call, mod); - __add_event_to_tracers(*call, file_ops); + __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; bool clear_trace = false; @@ -1733,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod) __trace_remove_event_call(call); } } - - /* Now free the file_operations */ - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - break; - } - if (&file_ops->list != &ftrace_module_file_list) { - list_del(&file_ops->list); - kfree(file_ops); - } up_write(&trace_event_sem); /* @@ -1762,6 +1804,7 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; + mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); switch (val) { case MODULE_STATE_COMING: @@ -1772,71 +1815,26 @@ static int trace_module_notify(struct notifier_block *self, break; } mutex_unlock(&event_mutex); + mutex_unlock(&trace_types_lock); return 0; } -static int -__trace_add_new_mod_event(struct ftrace_event_call *call, - struct trace_array *tr, - struct ftrace_module_file_ops *file_ops) -{ - return __trace_add_new_event(call, tr, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); -} - -#else -static inline struct ftrace_module_file_ops * -find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) -{ - return NULL; -} -static inline int trace_module_notify(struct notifier_block 
*self, - unsigned long val, void *data) -{ - return 0; -} -static inline int -__trace_add_new_mod_event(struct ftrace_event_call *call, - struct trace_array *tr, - struct ftrace_module_file_ops *file_ops) -{ - return -ENODEV; -} +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { - struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { - if (call->mod) { - /* - * Directories for events by modules need to - * keep module ref counts when opened (as we don't - * want the module to disappear when reading one - * of these files). The file_ops keep account of - * the module ref count. - */ - file_ops = find_ftrace_file_ops(file_ops, call->mod); - if (!file_ops) - continue; /* Warn? */ - ret = __trace_add_new_mod_event(call, tr, file_ops); - if (ret < 0) - pr_warning("Could not create directory for event %s\n", - call->name); - continue; - } - ret = __trace_add_new_event(call, tr, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create directory for event %s\n", call->name); @@ -2011,10 +2009,7 @@ event_enable_func(struct ftrace_hash *hash, int ret; /* hash funcs only work with set_ftrace_filter */ - if (!enabled) - return -EINVAL; - - if (!param) + if (!enabled || !param) return -EINVAL; system = strsep(¶m, ":"); @@ -2147,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { - ret = event_create_dir(tr->event_dir, file, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warning("Could not create directory for event %s\n", file->event_call->name); @@ -2188,37 +2179,18 @@ __trace_remove_event_dirs(struct trace_array *tr) { struct ftrace_event_file *file, *next; - list_for_each_entry_safe(file, next, &tr->events, list) { - list_del(&file->list); - debugfs_remove_recursive(file->dir); - remove_subsystem(file->system); - kmem_cache_free(file_cachep, file); - } + list_for_each_entry_safe(file, next, &tr->events, list) + remove_event_file_dir(file); } -static void -__add_event_to_tracers(struct ftrace_event_call *call, - struct ftrace_module_file_ops *file_ops) +static void __add_event_to_tracers(struct ftrace_event_call *call) { struct trace_array *tr; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (file_ops) - __trace_add_new_mod_event(call, tr, file_ops); - else - __trace_add_new_event(call, tr, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - } + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __trace_add_new_event(call, tr); } -static struct notifier_block trace_module_nb = { - .notifier_call = trace_module_notify, - .priority = 0, -}; - extern struct ftrace_event_call *__start_ftrace_events[]; extern struct ftrace_event_call *__stop_ftrace_events[]; @@ -2329,11 +2301,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) int event_trace_del_tracer(struct trace_array *tr) { - /* Disable any running events */ - __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); - mutex_lock(&event_mutex); + /* 
Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); @@ -2423,10 +2395,11 @@ static __init int event_trace_init(void) if (ret) return ret; +#ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); - +#endif return 0; } early_initcall(event_trace_memsetup); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1b653f7e1c..97daa8cf958 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -44,6 +44,7 @@ enum filter_op_ids OP_LE, OP_GT, OP_GE, + OP_BAND, OP_NONE, OP_OPEN_PAREN, }; @@ -54,6 +55,7 @@ struct filter_op { int precedence; }; +/* Order must be the same as enum filter_op_ids above */ static struct filter_op filter_ops[] = { { OP_OR, "||", 1 }, { OP_AND, "&&", 2 }, @@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = { { OP_LE, "<=", 5 }, { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, + { OP_BAND, "&", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ case OP_GE: \ match = (*addr >= val); \ break; \ + case OP_BAND: \ + match = (*addr & val); \ + break; \ default: \ break; \ } \ @@ -631,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } +/* caller must hold event_mutex */ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter; + struct event_filter *filter = call->filter; - mutex_lock(&event_mutex); - filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); - mutex_unlock(&event_mutex); + trace_seq_puts(s, "none\n"); } void print_subsystem_event_filter(struct event_subsystem *system, @@ -654,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -1835,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system, return err; } +/* caller must hold event_mutex */ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { struct event_filter *filter; - int err = 0; - - mutex_lock(&event_mutex); + int err; if (!strcmp(strstrip(filter_string), "0")) { filter_disable(call); filter = call->filter; if (!filter) - goto out_unlock; + return 0; RCU_INIT_POINTER(call->filter, NULL); /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); - goto out_unlock; + return 0; } err = create_filter(call, filter_string, true, &filter); @@ -1878,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) __free_filter(tmp); } } -out_unlock: - mutex_unlock(&event_mutex); return err; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d719198..38fe1483c50 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) return 0; } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data = 
{ .name = "function", .init = function_trace_init, @@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) trace_dump_stack(STACK_SKIP); } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (update_count(data)) + ftrace_dump(DUMP_ORIG); +} + static int ftrace_probe_print(const char *name, struct seq_file *m, unsigned long ip, void *data) @@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, return ftrace_probe_print("stacktrace", m, ip, data); } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, data); +} + +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, data); +} + static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, @@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { .print = ftrace_stacktrace_print, }; +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_traceon_print, @@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, param, enable); } +static int +ftrace_dump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = &cpudump_probe_ops; + + /* Only dump once. 
*/ + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + "1", enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { .func = ftrace_stacktrace_callback, }; +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) - unregister_ftrace_command(&ftrace_traceoff_cmd); + goto out_free_traceoff; ret = register_ftrace_command(&ftrace_stacktrace_cmd); - if (ret) { - unregister_ftrace_command(&ftrace_traceoff_cmd); - unregister_ftrace_command(&ftrace_traceon_cmd); - } + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + + return 0; + + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; } #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8388bc99f2e..b5c09242683 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* First spaces to align center */ for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* Last spaces to align center */ for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, " ------------------------------------------\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " => "); + ret = trace_seq_puts(s, " => "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, + ret = trace_seq_puts(s, "\n ------------------------------------------\n\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return ret; if (type == TRACE_GRAPH_ENT) - ret 
= trace_seq_printf(s, "==========>"); + ret = trace_seq_puts(s, "==========>"); else - ret = trace_seq_printf(s, "<=========="); + ret = trace_seq_puts(s, "<=========="); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "\n"); + ret = trace_seq_putc(s, '\n'); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) len += strlen(nsecs_str); } - ret = trace_seq_printf(s, " us "); + ret = trace_seq_puts(s, " us "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Print remaining spaces to fit the row's width */ for (i = len; i < 7; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, /* No real adata, just filling the column with spaces */ switch (duration) { case DURATION_FILL_FULL: - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_START: - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; case DURATION_FILL_END: - ret = trace_seq_printf(s, " |"); + ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) - ret = trace_seq_printf(s, "! "); + ret = trace_seq_puts(s, "! "); /* Duration exceeded 10 msecs */ else if (duration > 10000ULL) - ret = trace_seq_printf(s, "+ "); + ret = trace_seq_puts(s, "+ "); } /* @@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, * to fill out the space. 
*/ if (ret == -1) - ret = trace_seq_printf(s, " "); + ret = trace_seq_puts(s, " "); /* Catching here any failure happenned above */ if (!ret) @@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, if (ret != TRACE_TYPE_HANDLED) return ret; - ret = trace_seq_printf(s, "| "); + ret = trace_seq_puts(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter, /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, " | "); + ret = trace_seq_puts(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } @@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. */ if (func_match) { - ret = trace_seq_printf(s, "}\n"); + ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { @@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, /* Indentation */ if (depth > 0) for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_printf(s, " "); + ret = trace_seq_putc(s, ' '); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* The comment */ - ret = trace_seq_printf(s, "/* "); + ret = trace_seq_puts(s, "/* "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, s->len--; } - ret = trace_seq_printf(s, " */\n"); + ret = trace_seq_puts(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = { .funcs = &graph_functions }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = { .name = "function_graph", .open = graph_trace_open, .pipe_open = graph_trace_open, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b19d065a28c..2aefbee93a6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + if (!tracer_enabled || !tracing_is_enabled()) return; cpu = raw_smp_processor_id(); @@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) else return; - if (!tracer_enabled) + if (!tracer_enabled || !tracing_is_enabled()) return; data = per_cpu_ptr(tr->trace_buffer.data, cpu); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9f46e98ba8f..243f6834d02 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,12 +35,17 @@ struct trace_probe { const char *symbol; /* 
symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; - struct ftrace_event_file * __rcu *files; + struct list_head files; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + #define SIZEOF_TRACE_PROBE(n) \ (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) @@ -90,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) } static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); +static int unregister_probe_event(struct trace_probe *tp); static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); @@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; INIT_LIST_HEAD(&tp->list); + INIT_LIST_HEAD(&tp->files); return tp; error: kfree(tp->call.name); @@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event, return NULL; } -static int trace_probe_nr_files(struct trace_probe *tp) -{ - struct ftrace_event_file **file; - int ret = 0; - - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. - */ - file = rcu_dereference_raw(tp->files); - if (file) - while (*(file++)) - ret++; - - return ret; -} - -static DEFINE_MUTEX(probe_enable_lock); - /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. @@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - - old = rcu_dereference_raw(tp->files); - /* 1 is for new one and 1 is for stopper */ - new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) { ret = -ENOMEM; - goto out_unlock; + goto out; } - memcpy(new, old, n * sizeof(struct ftrace_event_file *)); - new[n] = file; - /* The last one keeps a NULL */ - rcu_assign_pointer(tp->files, new); - tp->flags |= TP_FLAG_TRACE; + link->file = file; + list_add_tail_rcu(&link->list, &tp->files); - if (old) { - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); - } + tp->flags |= TP_FLAG_TRACE; } else tp->flags |= TP_FLAG_PROFILE; - if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && - !trace_probe_has_gone(tp)) { + if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { if (trace_probe_is_return(tp)) ret = enable_kretprobe(&tp->rp); else ret = enable_kprobe(&tp->rp.kp); } - - out_unlock: - mutex_unlock(&probe_enable_lock); - + out: return ret; } -static int -trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +static struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) { - struct ftrace_event_file **files; - int i; + struct event_file_link *link; - /* - * Since all tp->files updater is protected by probe_enable_lock, - * we don't need to lock an rcu_read_lock. 
- */ - files = rcu_dereference_raw(tp->files); - if (files) { - for (i = 0; files[i]; i++) - if (files[i] == file) - return i; - } + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; - return -1; + return NULL; } /* @@ -281,43 +243,23 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) static int disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) { + struct event_file_link *link = NULL; + int wait = 0; int ret = 0; - mutex_lock(&probe_enable_lock); - if (file) { - struct ftrace_event_file **new, **old; - int n = trace_probe_nr_files(tp); - int i, j; - - old = rcu_dereference_raw(tp->files); - if (n == 0 || trace_probe_file_index(tp, file) < 0) { + link = find_event_file_link(tp, file); + if (!link) { ret = -EINVAL; - goto out_unlock; + goto out; } - if (n == 1) { /* Remove the last file */ - tp->flags &= ~TP_FLAG_TRACE; - new = NULL; - } else { - new = kzalloc(n * sizeof(struct ftrace_event_file *), - GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto out_unlock; - } - - /* This copy & check loop copies the NULL stopper too */ - for (i = 0, j = 0; j < n && i < n + 1; i++) - if (old[i] != file) - new[j++] = old[i]; - } - - rcu_assign_pointer(tp->files, new); + list_del_rcu(&link->list); + wait = 1; + if (!list_empty(&tp->files)) + goto out; - /* Make sure the probe is done with old files */ - synchronize_sched(); - kfree(old); + tp->flags &= ~TP_FLAG_TRACE; } else tp->flags &= ~TP_FLAG_PROFILE; @@ -326,10 +268,21 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) disable_kretprobe(&tp->rp); else disable_kprobe(&tp->rp.kp); + wait = 1; + } + out: + if (wait) { + /* + * Synchronize with kprobe_trace_func/kretprobe_trace_func + * to ensure disabled (all running handlers are finished). + * This is not only for kfree(), but also the caller, + * trace_remove_event_call() supposes it for releasing + * event_call related objects, which will be accessed in + * the kprobe_trace_func/kretprobe_trace_func. + */ + synchronize_sched(); + kfree(link); /* Ignored if link == NULL */ } - - out_unlock: - mutex_unlock(&probe_enable_lock); return ret; } @@ -398,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp) if (trace_probe_is_enabled(tp)) return -EBUSY; + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_probe_event(tp)) + return -EBUSY; + __unregister_trace_probe(tp); list_del(&tp->list); - unregister_probe_event(tp); return 0; } @@ -679,7 +635,9 @@ static int release_all_trace_probes(void) /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { tp = list_entry(probe_list.next, struct trace_probe, list); - unregister_trace_probe(tp); + ret = unregister_trace_probe(tp); + if (ret) + goto end; free_trace_probe(tp); } @@ -885,20 +843,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, static __kprobes void kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. 
- */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kprobe_trace_func(tp, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kprobe_trace_func(tp, regs, link->file); } /* Kretprobe handler */ @@ -945,20 +893,10 @@ static __kprobes void kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct pt_regs *regs) { - /* - * Note: preempt is already disabled around the kprobe handler. - * However, we still need an smp_read_barrier_depends() corresponding - * to smp_wmb() in rcu_assign_pointer() to access the pointer. - */ - struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - - if (unlikely(!file)) - return; + struct event_file_link *link; - while (*file) { - __kretprobe_trace_func(tp, ri, regs, *file); - file++; - } + list_for_each_entry_rcu(link, &tp->files, list) + __kretprobe_trace_func(tp, ri, regs, link->file); } /* Event entry printers */ @@ -1157,13 +1095,14 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) @@ -1172,10 +1111,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) entry->ip = (unsigned long)tp->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, - entry->ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1189,13 +1125,14 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, int size, __size, dsize; int rctx; + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + dsize = __get_data_size(tp, regs); __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "profile buffer not large enough")) - return; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) @@ -1204,13 +1141,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, - entry->ret_ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. 
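The trace_kprobe conversion above replaces the hand-rolled, RCU-managed array of ftrace_event_file pointers with a list of event_file_link nodes: enabling adds a node with list_add_tail_rcu(), disabling unlinks it with list_del_rcu() and only frees it after synchronize_sched(), and the probe handlers walk the list with list_for_each_entry_rcu(). The ordering is the important part, so here is a deliberately simplified, single-threaded user-space model of it (no real RCU; struct file_link, probe_add_file() and friends are invented for the example).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Plain intrusive list node; in the kernel this is struct event_file_link. */
struct file_link {
	struct file_link *next;
	const char *file_name;       /* stands in for struct ftrace_event_file * */
};

struct probe {
	struct file_link *files;     /* head of the per-probe consumer list */
};

static void probe_add_file(struct probe *tp, const char *name)
{
	struct file_link *link = malloc(sizeof(*link));
	if (!link)
		return;
	link->file_name = name;
	/* Kernel: list_add_tail_rcu(); publishing is a single pointer store,
	 * so concurrent readers see either the old list or the new one. */
	link->next = tp->files;
	tp->files = link;
}

static void probe_del_file(struct probe *tp, const char *name)
{
	struct file_link **pp = &tp->files;

	while (*pp && strcmp((*pp)->file_name, name) != 0)
		pp = &(*pp)->next;
	if (!*pp)
		return;

	struct file_link *victim = *pp;
	*pp = victim->next;          /* kernel: list_del_rcu() */

	/* Kernel: synchronize_sched() goes here, so any handler still walking
	 * the old list finishes before the node (and the event objects the
	 * caller is about to release) can be reused. */
	free(victim);
}

/* Models __kprobe_trace_func() fanning one probe hit out to every consumer. */
static void probe_fire(struct probe *tp)
{
	for (struct file_link *l = tp->files; l; l = l->next)
		printf("logging hit to %s\n", l->file_name);
}

int main(void)
{
	struct probe tp = { .files = NULL };

	probe_add_file(&tp, "instances/foo/events/kprobes/myprobe");
	probe_add_file(&tp, "events/kprobes/myprobe");
	probe_fire(&tp);

	probe_del_file(&tp, "events/kprobes/myprobe");
	probe_fire(&tp);
	probe_del_file(&tp, "instances/foo/events/kprobes/myprobe");
	return 0;
}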
+ */ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) @@ -1312,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp) return ret; } -static void unregister_probe_event(struct trace_probe *tp) +static int unregister_probe_event(struct trace_probe *tp) { + int ret; + /* tp->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tp->call); - kfree(tp->call.print_fmt); + ret = trace_remove_event_call(&tp->call); + if (!ret) + kfree(tp->call.print_fmt); + return ret; } /* Make a debugfs interface for controlling probe points */ @@ -1376,6 +1320,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) return NULL; } +/* + * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * stage, we can do this lockless. + */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a5e8f4878bf..b3dcfb2f0fe 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) if (drv) ret += trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_printf(s, " \n"); + ret += trace_seq_puts(s, " \n"); return ret; } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) struct header_iter *hiter; struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION 20070824\n"); + trace_seq_puts(s, "VERSION 20070824\n"); hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); if (!hiter) @@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_printf(s, "rw what?\n"); + ret = trace_seq_puts(s, "rw what?\n"); break; } if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_printf(s, "map what?\n"); + ret = trace_seq_puts(s, "map what?\n"); break; } if (ret) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bb922d9ee51..34e7cbac0c9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_puts(s, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ret) ret = trace_seq_puts(s, "??"); if (ret) - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_puts(s, "\n"); + ret = trace_seq_putc(s, '\n'); } if (mm) @@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) int ret; if (!ip) - return trace_seq_printf(s, "0"); + return trace_seq_putc(s, '0'); if (sym_flags & TRACE_ITER_SYM_OFFSET) ret = seq_print_sym_offset(s, "%s", ip); @@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, goto partial; if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_printf(s, " <-")) + if (!trace_seq_puts(s, " <-")) goto partial; if (!seq_print_ip_sym(s, field->parent_ip, flags)) goto partial; } - if 
(!trace_seq_printf(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; return TRACE_TYPE_HANDLED; @@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, if (!seq_print_ip_sym(s, *p, flags)) goto partial; - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_putc(s, '\n')) goto partial; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad..2900817ba65 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; int start_index; + int last_index; start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; if (*pos < start_index) return __start___trace_bprintk_fmt + *pos; + /* + * The __tracepoint_str section is treated the same as the + * __trace_printk_fmt section. The difference is that the + * __trace_printk_fmt section should only be used by trace_printk() + * in a debugging environment, as if anything exists in that section + * the trace_prink() helper buffers are allocated, which would just + * waste space in a production environment. + * + * The __tracepoint_str sections on the other hand are used by + * tracepoints which need to map pointers to their strings to + * the ASCII text for userspace. + */ + last_index = start_index; + start_index = __stop___tracepoint_str - __start___tracepoint_str; + + if (*pos < last_index + start_index) + return __start___tracepoint_str + (*pos - last_index); + return find_next_mod_format(start_index, v, fmt, pos); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2901e3b8859..a7329b7902f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -640,13 +640,20 @@ out: * Enable ftrace, sleep 1/10 second, and then read the trace * buffer to see if all is in order. */ -int +__init int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; int ret; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + /* make sure msleep has been recorded */ msleep(1); @@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) * Pretty much the same than for the function tracer from which the selftest * has been borrowed. */ -int +__init int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr) { int ret; unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... 
"); + return 0; + } +#endif + /* * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8f2ac73c7a5..559329d9bd2 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, entry = syscall_nr_to_meta(syscall); if (!entry) { - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); return TRACE_TYPE_HANDLED; } @@ -200,8 +200,8 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -static -int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; int pos = 0; @@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) return pos; } -static int set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct ftrace_event_call *call) { char *print_fmt; int len; @@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call) return 0; } -static void free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct ftrace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; int size; @@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->enter_event->event.type, size, 0, 0); + sys_data->enter_event->event.type, size, irq_flags, pc); if (!event) return; @@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!filter_current_check_discard(buffer, sys_data->enter_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct ring_buffer_event *event; struct ring_buffer *buffer; + unsigned long irq_flags; + int pc; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; + local_save_flags(irq_flags); + pc = preempt_count(); + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, - sys_data->exit_event->event.type, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), + irq_flags, pc); if (!event) return; @@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!filter_current_check_discard(buffer, sys_data->exit_event, entry, event)) - trace_current_buffer_unlock_commit(buffer, event, 0, 0); + trace_current_buffer_unlock_commit(buffer, event, + irq_flags, pc); } static int 
reg_event_syscall_enter(struct ftrace_event_file *file, @@ -446,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct ftrace_event_call *call) { int id; int num; @@ -553,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* get the size after alignment with the u32 buffer size field */ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) - return; - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->event.type, regs, &rctx); if (!rec) @@ -570,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - - head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } @@ -629,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) + return; + /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - /* - * Impossible, but be paranoid with the future - * How to put this check outside runtime? - */ - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "exit event has grown above perf buffer size")) - return; - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->event.type, regs, &rctx); if (!rec) @@ -648,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - - head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 32494fb0ee6..272261b5f94 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -70,7 +70,7 @@ struct trace_uprobe { (sizeof(struct probe_arg) * (n))) static int register_uprobe_event(struct trace_uprobe *tu); -static void unregister_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); @@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou } /* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ -static void unregister_trace_uprobe(struct trace_uprobe *tu) +static int unregister_trace_uprobe(struct trace_uprobe *tu) { + int ret; + + ret = unregister_uprobe_event(tu); + if (ret) + return ret; + list_del(&tu->list); - unregister_uprobe_event(tu); free_trace_uprobe(tu); + return 0; } /* Register a trace_uprobe and probe_event */ @@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) /* register as an event */ old_tp = find_probe_event(tu->call.name, tu->call.class->system); - if (old_tp) + if (old_tp) { /* delete old event */ - unregister_trace_uprobe(old_tp); + ret = 
unregister_trace_uprobe(old_tp); + if (ret) + goto end; + } ret = register_uprobe_event(tu); if (ret) { @@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv) group = UPROBE_EVENT_SYSTEM; if (is_delete) { + int ret; + if (!event) { pr_info("Delete command needs an event name.\n"); return -EINVAL; @@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv) return -ENOENT; } /* delete an event */ - unregister_trace_uprobe(tu); + ret = unregister_trace_uprobe(tu); mutex_unlock(&uprobe_lock); - return 0; + return ret; } if (argc < 2) { @@ -283,8 +294,10 @@ static int create_trace_uprobe(int argc, char **argv) return -EINVAL; } arg = strchr(argv[1], ':'); - if (!arg) + if (!arg) { + ret = -EINVAL; goto fail_address_parse; + } *arg++ = '\0'; filename = argv[1]; @@ -406,16 +419,20 @@ fail_address_parse: return ret; } -static void cleanup_all_probes(void) +static int cleanup_all_probes(void) { struct trace_uprobe *tu; + int ret = 0; mutex_lock(&uprobe_lock); while (!list_empty(&uprobe_list)) { tu = list_entry(uprobe_list.next, struct trace_uprobe, list); - unregister_trace_uprobe(tu); + ret = unregister_trace_uprobe(tu); + if (ret) + break; } mutex_unlock(&uprobe_lock); + return ret; } /* Probes listing interfaces */ @@ -460,8 +477,13 @@ static const struct seq_operations probes_seq_op = { static int probes_open(struct inode *inode, struct file *file) { - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - cleanup_all_probes(); + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = cleanup_all_probes(); + if (ret) + return ret; + } return seq_open(file, &probes_seq_op); } @@ -816,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); - if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; preempt_disable(); head = this_cpu_ptr(call->perf_events); @@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu) return ret; } -static void unregister_uprobe_event(struct trace_uprobe *tu) +static int unregister_uprobe_event(struct trace_uprobe *tu) { + int ret; + /* tu->event is unregistered in trace_remove_event_call() */ - trace_remove_event_call(&tu->call); + ret = trace_remove_event_call(&tu->call); + if (ret) + return ret; kfree(tu->call.print_fmt); tu->call.print_fmt = NULL; + return 0; } /* Make a trace interface for controling probe points */ diff --git a/kernel/uid16.c b/kernel/uid16.c index f6c83d7ef00..602e5bbbcef 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!ns_capable(current_user_ns(), CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf..630d72bf7e4 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -10,12 +10,64 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { + unsigned long flags; + WARN_ON(cpu != 0); - local_irq_disable(); - (func)(info); - local_irq_enable(); + local_irq_save(flags); + func(info); + local_irq_restore(flags); return 0; } EXPORT_SYMBOL(smp_call_function_single); + +int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + 
local_irq_restore(flags); + return 0; +} +EXPORT_SYMBOL(on_each_cpu); + +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + unsigned long flags; + + if (cpumask_test_cpu(0, mask)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + unsigned long flags; + + preempt_disable(); + if (cond_func(0, info)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + preempt_enable(); +} +EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/user.c b/kernel/user.c index 69b4c3d48cd..5bbb91988e6 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,8 +51,6 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, - .may_mount_sysfs = true, - .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d8c30db06c5..13fb1134ba5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -62,6 +62,9 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + if (parent_ns->level > 32) + return -EUSERS; + /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, @@ -92,29 +95,33 @@ int create_user_ns(struct cred *new) atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; + ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; set_cred_user_ns(new, ns); - update_mnt_policy(ns); - return 0; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; + int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); - if (!cred) - return -ENOMEM; + if (cred) { + err = create_user_ns(cred); + if (err) + put_cred(cred); + else + *new_cred = cred; + } - *new_cred = cred; - return create_user_ns(cred); + return err; } void free_user_ns(struct user_namespace *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index 2fc8576efaa..fd393124e50 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) struct uts_namespace *ns = new; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/kernel/wait.c b/kernel/wait.c index 6698e0c04ea..d550920e040 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit) return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; } EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). 
+ */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). 
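The new wait.c helpers let a sleeper block until an atomic_t reaches zero, reusing the bit-waitqueue hash table (the atomic_t address is massaged into a fake bit number so it lands in the same table), with the zero check done in the waiter's wake function. A rough user-space analogue of that zero-waiter pattern, using C11 atomics plus a pthread condition variable instead of a kernel waitqueue; the names here are invented.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Callers sleep until the shared counter drops to zero; the last dropper wakes them. */
static atomic_int pending = 3;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  zero = PTHREAD_COND_INITIALIZER;

static void put_pending(void)
{
	if (atomic_fetch_sub(&pending, 1) == 1) {   /* we just made it zero */
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&zero);      /* analogous to wake_up_atomic_t() */
		pthread_mutex_unlock(&lock);
	}
}

static void wait_for_zero(void)
{
	pthread_mutex_lock(&lock);
	while (atomic_load(&pending) != 0)          /* re-check, like the wake function does */
		pthread_cond_wait(&zero, &lock);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);                                   /* pretend to do some I/O */
	put_pending();
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, NULL);

	wait_for_zero();                            /* analogous to wait_on_atomic_t() */
	printf("all outstanding references dropped\n");

	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}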
+ */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f0..4431610f049 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,9 +29,9 @@ #include <linux/kvm_para.h> #include <linux/perf_event.h> -int watchdog_enabled = 1; +int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled; +static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); @@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup); /* deprecated */ static int __init nosoftlockup_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); @@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_enabled) { + if (watchdog_user_enabled) { unsigned cpu; for_each_present_cpu(cpu) { @@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - if (!watchdog_enabled) { - kthread_park(current); - return; - } - /* Enable the perf event */ watchdog_nmi_enable(cpu); @@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu) watchdog_nmi_disable(cpu); } +static void watchdog_cleanup(unsigned int cpu, bool online) +{ + watchdog_disable(cpu); +} + static int watchdog_should_run(unsigned int cpu) { return __this_cpu_read(hrtimer_interrupts) != @@ -475,28 +475,87 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL -static void watchdog_enable_all_cpus(void) +static struct smp_hotplug_thread watchdog_threads = { + .store = &softlockup_watchdog, + .thread_should_run = watchdog_should_run, + .thread_fn = watchdog, + .thread_comm = "watchdog/%u", + .setup = watchdog_enable, + .cleanup = watchdog_cleanup, + .park = watchdog_disable, + .unpark = watchdog_enable, +}; + +static void restart_watchdog_hrtimer(void *info) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + int ret; + + /* + * No need to cancel and restart hrtimer if it is currently executing + * because it will reprogram itself with the new period now. + * We should never see it unqueued here because we are running per-cpu + * with interrupts disabled. + */ + ret = hrtimer_try_to_cancel(hrtimer); + if (ret == 1) + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); +} + +static void update_timers(int cpu) +{ + struct call_single_data data = {.func = restart_watchdog_hrtimer}; + /* + * Make sure that perf event counter will adopt to a new + * sampling period. 
Updating the sampling period directly would + * be much nicer but we do not have an API for that now so + * let's use a big hammer. + * Hrtimer will adopt the new period on the next tick but this + * might be late already so we have to restart the timer as well. + */ + watchdog_nmi_disable(cpu); + __smp_call_function_single(cpu, &data, 1); + watchdog_nmi_enable(cpu); +} + +static void update_timers_all_cpus(void) +{ + int cpu; + + get_online_cpus(); + preempt_disable(); + for_each_online_cpu(cpu) + update_timers(cpu); + preempt_enable(); + put_online_cpus(); +} + +static int watchdog_enable_all_cpus(bool sample_period_changed) { - unsigned int cpu; + int err = 0; - if (watchdog_disabled) { - watchdog_disabled = 0; - for_each_online_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + if (!watchdog_running) { + err = smpboot_register_percpu_thread(&watchdog_threads); + if (err) + pr_err("Failed to create watchdog threads, disabled\n"); + else + watchdog_running = 1; + } else if (sample_period_changed) { + update_timers_all_cpus(); } + + return err; } +/* prepare/enable/disable routines */ +/* sysctl functions */ +#ifdef CONFIG_SYSCTL static void watchdog_disable_all_cpus(void) { - unsigned int cpu; - - if (!watchdog_disabled) { - watchdog_disabled = 1; - for_each_online_cpu(cpu) - kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (watchdog_running) { + watchdog_running = 0; + smpboot_unregister_percpu_thread(&watchdog_threads); } } @@ -507,45 +566,43 @@ static void watchdog_disable_all_cpus(void) int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; + int err, old_thresh, old_enabled; + static DEFINE_MUTEX(watchdog_proc_mutex); - if (watchdog_disabled < 0) - return -ENODEV; + mutex_lock(&watchdog_proc_mutex); + old_thresh = ACCESS_ONCE(watchdog_thresh); + old_enabled = ACCESS_ONCE(watchdog_user_enabled); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (err || !write) + goto out; set_sample_period(); /* * Watchdog threads shouldn't be enabled if they are - * disabled. The 'watchdog_disabled' variable check in + * disabled. The 'watchdog_running' variable check in * watchdog_*_all_cpus() function takes care of this. 
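The watchdog rework in the preceding hunks drops the open-coded kthread parking in favour of the smpboot per-CPU thread infrastructure. For orientation, here is a minimal, hypothetical registration following the same shape as watchdog_threads; the example_* names are invented and the callbacks are stubs.

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, example_tsk);
static DEFINE_PER_CPU(unsigned long, example_pending);

static int example_should_run(unsigned int cpu)
{
	return __this_cpu_read(example_pending) != 0;
}

static void example_fn(unsigned int cpu)	/* runs whenever work is pending */
{
	__this_cpu_write(example_pending, 0);
}

static void example_setup(unsigned int cpu)	{ /* per-CPU init, e.g. arm a timer */ }
static void example_park(unsigned int cpu)	{ /* CPU going down or thread disabled */ }

static struct smp_hotplug_thread example_threads = {
	.store			= &example_tsk,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_fn,
	.thread_comm		= "example/%u",
	.setup			= example_setup,
	.park			= example_park,
	.unpark			= example_setup,	/* mirror the watchdog: re-run setup */
};

static int __init example_init(void)
{
	return smpboot_register_percpu_thread(&example_threads);
}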
*/ - if (watchdog_enabled && watchdog_thresh) - watchdog_enable_all_cpus(); + if (watchdog_user_enabled && watchdog_thresh) + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); else watchdog_disable_all_cpus(); - return ret; + /* Restore old values on failure */ + if (err) { + watchdog_thresh = old_thresh; + watchdog_user_enabled = old_enabled; + } +out: + mutex_unlock(&watchdog_proc_mutex); + return err; } #endif /* CONFIG_SYSCTL */ -static struct smp_hotplug_thread watchdog_threads = { - .store = &softlockup_watchdog, - .thread_should_run = watchdog_should_run, - .thread_fn = watchdog, - .thread_comm = "watchdog/%u", - .setup = watchdog_enable, - .park = watchdog_disable, - .unpark = watchdog_enable, -}; - void __init lockup_detector_init(void) { set_sample_period(); - if (smpboot_register_percpu_thread(&watchdog_threads)) { - pr_err("Failed to create watchdog threads, disabled\n"); - watchdog_disabled = -ENODEV; - } + + if (watchdog_user_enabled) + watchdog_enable_all_cpus(false); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee8e29a2320..987293d03eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -16,9 +16,10 @@ * * This is the generic async execution mechanism. Work items as are * executed in process context. The worker pool is shared and - * automatically managed. There is one worker pool for each CPU and - * one extra for works which are better served by workers which are - * not bound to any specific CPU. + * automatically managed. There are two worker pools for each CPU (one for + * normal work items and the other for high priority ones) and some extra + * pools for workqueues which are not bound to any specific CPU - the + * number of these backing pools is dynamic. * * Please read Documentation/workqueue.txt for details. */ @@ -272,6 +273,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_disable_numa; module_param_named(disable_numa, wq_disable_numa, bool, 0444); +/* see the comment above the definition of WQ_POWER_EFFICIENT */ +#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT +static bool wq_power_efficient = true; +#else +static bool wq_power_efficient; +#endif + +module_param_named(power_efficient, wq_power_efficient, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -305,6 +315,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +struct workqueue_struct *system_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_power_efficient_wq); +struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, @@ -527,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) * This must be called either with pwq_lock held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. + * + * Return: The unbound pool_workqueue for @node. 
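The WQ_POWER_EFFICIENT plumbing added above (the workqueue.power_efficient module parameter plus the new system_power_efficient_wq / system_freezable_power_efficient_wq) lets per-CPU workqueues become unbound when power saving is preferred. A hedged sketch of how a driver might opt in; the example_* names and the HZ polling interval are invented.

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct delayed_work example_poll;

static void example_poll_fn(struct work_struct *work)
{
	/* ...light housekeeping... then re-arm on the power-efficient wq */
	queue_delayed_work(system_power_efficient_wq, &example_poll, HZ);
}

static int __init example_init(void)
{
	/*
	 * Per-CPU by default; promoted to WQ_UNBOUND (so an already-awake
	 * CPU can be picked) when workqueue.power_efficient is set or
	 * CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y.
	 */
	example_wq = alloc_workqueue("example", WQ_POWER_EFFICIENT, 0);
	if (!example_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&example_poll, example_poll_fn);
	queue_delayed_work(system_power_efficient_wq, &example_poll, HZ);
	return 0;
}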
*/ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) @@ -625,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * - * Return the worker_pool @work was last associated with. %NULL if none. - * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under sched-RCU read lock. As such, this function should be * called under wq_pool_mutex or with preemption disabled. @@ -635,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. + * + * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { @@ -658,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) * get_work_pool_id - return the worker pool ID a given work is associated with * @work: the work item of interest * - * Return the worker_pool ID @work was last associated with. + * Return: The worker_pool ID @work was last associated with. * %WORK_OFFQ_POOL_NONE if none. */ static int get_work_pool_id(struct work_struct *work) @@ -817,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * CONTEXT: * spin_lock_irq(rq->lock) * - * RETURNS: + * Return: * Worker task on @cpu to wake up, %NULL if none. */ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) @@ -952,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * CONTEXT: * spin_lock_irq(pool->lock). * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL + * Return: + * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, @@ -1141,14 +1157,16 @@ out_put: * @flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any - * stable state - idle, on timer or on worklist. Return values are + * stable state - idle, on timer or on worklist. * + * Return: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long * + * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being @@ -1390,10 +1408,10 @@ retry: * @wq: workqueue to use * @work: work to queue * - * Returns %false if @work was already on a queue, %true otherwise. - * * We queue the work to a specific CPU, the caller must ensure it * can't go away. + * + * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) @@ -1463,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns %false if @work was already on a queue, %true otherwise. If + * Return: %false if @work was already on a queue, %true otherwise. 
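The return-value documentation rewritten above for queue_delayed_work_on() and mod_delayed_work_on() is easiest to read next to a caller. A small, hypothetical sketch of the convention: %false means this call queued the work itself, %true means an already-pending instance was only re-timed.

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

static void example_timeout_fn(struct work_struct *work)	{ /* handle expiry */ }
static DECLARE_DELAYED_WORK(example_timeout, example_timeout_fn);

static void example_arm_timeout(unsigned long delay)
{
	if (mod_delayed_work(system_wq, &example_timeout, delay))
		pr_debug("timeout was already pending; deadline pushed back\n");
	/* else: this call queued the timeout afresh */
}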
If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. */ @@ -1499,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on); * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * - * Returns %false if @dwork was idle and queued, %true if @dwork was + * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. @@ -1614,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker) * Might sleep. Called without any lock but returns with pool->lock * held. * - * RETURNS: + * Return: * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ @@ -1675,7 +1693,7 @@ static struct worker *alloc_worker(void) * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * - * RETURNS: + * Return: * Pointer to the newly created worker. */ static struct worker *create_worker(struct worker_pool *pool) @@ -1775,6 +1793,8 @@ static void start_worker(struct worker *worker) * @pool: the target pool * * Grab the managership of @pool and create and start a new worker for it. + * + * Return: 0 on success. A negative error code otherwise. */ static int create_and_start_worker(struct worker_pool *pool) { @@ -1919,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool) * multiple times. Does GFP_KERNEL allocations. Called only from * manager. * - * RETURNS: + * Return: * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ @@ -1976,7 +1996,7 @@ restart: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Called only from manager. * - * RETURNS: + * Return: * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ @@ -2019,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * - * RETURNS: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Does GFP_KERNEL allocations. + * Return: + * %false if the pool don't need management and the caller can safely start + * processing works, %true indicates that the function released pool->lock + * and reacquired it to perform some management function and that the + * conditions that the caller verified while holding the lock before + * calling the function might no longer be true. */ static bool manage_workers(struct worker *worker) { @@ -2188,6 +2211,15 @@ __acquires(&pool->lock) dump_stack(); } + /* + * The following prevents a kworker from hogging CPU on !PREEMPT + * kernels, where a requeueing work item waiting for something to + * happen could deadlock with stop_machine as such work item could + * indefinitely requeue itself while all other CPUs are trapped in + * stop_machine. + */ + cond_resched(); + spin_lock_irq(&pool->lock); /* clear cpu intensive status */ @@ -2233,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker) * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). + * + * Return: 0 */ static int worker_thread(void *__worker) { @@ -2331,6 +2365,8 @@ sleep: * those works so that forward progress can be guaranteed. * * This should happen rarely. 
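The cond_resched() added to process_one_work() above targets work items that requeue themselves back-to-back. A deliberately bad, hypothetical example of the pattern it defends against on !PREEMPT kernels (example_* names invented):

#include <linux/compiler.h>
#include <linux/workqueue.h>

static bool example_done;

/*
 * Polling by immediate self-requeue: before the cond_resched() above, a
 * kworker running this on a !PREEMPT kernel could hog its CPU and, as the
 * new comment notes, deadlock against stop_machine.
 */
static void example_poll_fn(struct work_struct *work)
{
	if (!ACCESS_ONCE(example_done))
		queue_work(system_wq, work);
}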
+ * + * Return: 0 */ static int rescuer_thread(void *__rescuer) { @@ -2503,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * CONTEXT: * mutex_lock(wq->mutex). * - * RETURNS: + * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ @@ -2804,6 +2840,19 @@ already_gone: return false; } +static bool __flush_work(struct work_struct *work) +{ + struct wq_barrier barr; + + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } +} + /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2811,24 +2860,16 @@ already_gone: * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * - * RETURNS: + * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ bool flush_work(struct work_struct *work) { - struct wq_barrier barr; - lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } + return __flush_work(work); } EXPORT_SYMBOL_GPL(flush_work); @@ -2871,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. * - * RETURNS: + * Return: * %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) @@ -2888,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * - * RETURNS: + * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ @@ -2906,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work); * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * - * Kill off a pending delayed_work. Returns %true if @dwork was pending - * and canceled; %false if wasn't pending. Note that the work callback - * function may still be running on return, unless it returns %true and the - * work doesn't re-arm itself. Explicitly flush or use - * cancel_delayed_work_sync() to wait on it. + * Kill off a pending delayed_work. + * + * Return: %true if @dwork was pending and canceled; %false if it wasn't + * pending. + * + * Note: + * The work callback function may still be running on return, unless + * it returns %true and the work doesn't re-arm itself. Explicitly flush or + * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ @@ -2939,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work); * * This is cancel_work_sync() for delayed works. * - * RETURNS: + * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) @@ -2956,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * - * RETURNS: + * Return: * 0 on success, -errno on failure. 
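The clarified note above on cancel_delayed_work() (the callback may still be running on return) is why teardown paths normally use the _sync variant. A hedged sketch with an invented example_dev structure:

#include <linux/workqueue.h>

struct example_dev {				/* hypothetical driver state */
	struct delayed_work	poll_work;
	void			*resources;
};

static void example_shutdown(struct example_dev *dev)
{
	/* may sleep: waits until the callback has finished and cannot re-arm */
	cancel_delayed_work_sync(&dev->poll_work);
	/* now safe to release dev->resources */
}

static void example_stop_atomic(struct example_dev *dev)
{
	/* usable from IRQ context, but the callback may still be running */
	cancel_delayed_work(&dev->poll_work);
}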
*/ int schedule_on_each_cpu(work_func_t func) @@ -3024,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work); * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * - * Returns: 0 - function was executed + * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) @@ -3068,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) return wq_dev->wq; } -static ssize_t wq_per_cpu_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } +static DEVICE_ATTR_RO(per_cpu); -static ssize_t wq_max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } -static ssize_t wq_max_active_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; @@ -3097,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev, workqueue_set_max_active(wq, val); return count; } +static DEVICE_ATTR_RW(max_active); -static struct device_attribute wq_sysfs_attrs[] = { - __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), - __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), - __ATTR_NULL, +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_pool_ids_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -3252,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { static struct bus_type wq_subsys = { .name = "workqueue", - .dev_attrs = wq_sysfs_attrs, + .dev_groups = wq_sysfs_groups, }; static int __init wq_sysfs_init(void) @@ -3281,7 +3329,7 @@ static void wq_device_release(struct device *dev) * apply_workqueue_attrs() may race against userland updating the * attributes. * - * Returns 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { @@ -3374,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and - * return it. Returns NULL on failure. + * return it. + * + * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { @@ -3398,6 +3448,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + /* + * Unlike hash and equality test, this function doesn't ignore + * ->no_numa as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->no_numa after copying. 
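The sysfs conversion above (wq_per_cpu_show/wq_max_active_* becoming DEVICE_ATTR_RO/RW entries collected via ATTRIBUTE_GROUPS and wired up through .dev_groups) follows the attribute-group idiom new in this release. A generic, hypothetical sketch of the same pattern for a made-up "threshold" attribute:

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static unsigned int example_threshold;

/* DEVICE_ATTR_RW() expects <name>_show()/<name>_store() helpers */
static ssize_t threshold_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return scnprintf(buf, PAGE_SIZE, "%u\n", example_threshold);
}

static ssize_t threshold_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	if (kstrtouint(buf, 0, &example_threshold))
		return -EINVAL;
	return count;
}
static DEVICE_ATTR_RW(threshold);

static struct attribute *example_attrs[] = {
	&dev_attr_threshold.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);	/* defines example_groups[] for .dev_groups */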
+ */ + to->no_numa = from->no_numa; } /* hash value of the content of @attr */ @@ -3427,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, * @pool: worker_pool to initialize * * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. - * Returns 0 on success, -errno on failure. Even on failure, all fields + * + * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. */ @@ -3534,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool) * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to - * create a new one. On failure, returns NULL. + * create a new one. * * Should be called with wq_pool_mutex held. + * + * Return: On success, a worker_pool with the same attributes as @attrs. + * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { @@ -3565,6 +3625,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + /* + * no_numa isn't a worker_pool attribute, always clear it. See + * 'struct workqueue_attrs' comments for detail. + */ + pool->attrs->no_numa = false; + /* if cpumask is contained inside a NUMA node, we belong to that node */ if (wq_numa_enabled) { for_each_node(node) { @@ -3766,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) * * Calculate the cpumask a workqueue with @attrs should use on @node. If * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. This function returns - * %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. + * calculation. The result is stored in @cpumask. * * If NUMA affinity is not enabled, @attrs->cpumask is always used. If * enabled and @node has online CPUs requested by @attrs, the returned @@ -3777,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) * * The caller is responsible for ensuring that the cpumask of @node stays * stable. + * + * Return: %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. */ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) @@ -3830,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, * items finish. Note that a work item which repeatedly requeues itself * back-to-back will stay on its current pwq. * - * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on - * failure. + * Performs GFP_KERNEL allocations. + * + * Return: 0 on success and -errno on failure. 
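apply_workqueue_attrs(), whose kernel-doc is tidied just below, is how built-in code retargets an unbound workqueue; at this point it is, if memory serves, not exported to modules. A hedged sketch with invented tuning values:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

static int example_tune_unbound_wq(struct workqueue_struct *wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;				/* mild priority boost */
	cpumask_copy(attrs->cpumask, cpumask_of_node(0));/* keep workers on node 0 */
	attrs->no_numa = true;				/* one pwq, no per-node split */

	ret = apply_workqueue_attrs(wq, attrs);		/* @wq must be WQ_UNBOUND */
	free_workqueue_attrs(attrs);
	return ret;
}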
*/ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) @@ -4086,6 +4154,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ + if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) + flags |= WQ_UNBOUND; + /* allocate wq and format name */ if (flags & WQ_UNBOUND) tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); @@ -4295,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. + * + * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { @@ -4318,7 +4392,7 @@ bool current_is_workqueue_rescuer(void) * workqueue being congested on one CPU doesn't mean the workqueue is also * contested on other CPUs / NUMA nodes. * - * RETURNS: + * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) @@ -4351,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested); * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * - * RETURNS: + * Return: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) @@ -4627,7 +4701,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. */ -static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -4680,7 +4754,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, * Workqueues should be brought down after normal priority CPU notifiers. * This will be registered as low priority CPU notifier. */ -static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -4729,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work) * @fn: the function to run * @arg: the function arg * - * This will return the value @fn returns. * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. */ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { @@ -4739,7 +4814,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); + + /* + * The work item is on-stack and can't lead to deadlock through + * flushing. Use __flush_work() to avoid spurious lockdep warnings + * when work_on_cpu()s are nested. + */ + __flush_work(&wfc.work); + return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); @@ -4796,7 +4878,7 @@ void freeze_workqueues_begin(void) * CONTEXT: * Grabs and releases wq_pool_mutex. * - * RETURNS: + * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. 
 */
@@ -4985,8 +5067,15 @@ static int __init init_workqueues(void)
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
+	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+					      WQ_POWER_EFFICIENT, 0);
+	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+					      0);
 	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_unbound_wq || !system_freezable_wq);
+	       !system_unbound_wq || !system_freezable_wq ||
+	       !system_power_efficient_wq ||
+	       !system_freezable_power_efficient_wq);
 	return 0;
 }
 early_initcall(init_workqueues);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ec..7e2204db0b1 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
 
 /*
  * Scheduler hooks for concurrency managed workqueue. Only to be used from
- * sched.c and workqueue.c.
+ * sched/core.c and workqueue.c.
  */
 void wq_worker_waking_up(struct task_struct *task, int cpu);
 struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
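Finally, the work_on_cpu() change above switches its flush to __flush_work() so that nested work_on_cpu() callers do not trigger spurious lockdep reports. For context, a typical (hypothetical) caller looks like the following; the example_* names and payload are invented.

#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct example_query {			/* invented payload */
	int	reg;
	u64	val;
};

static long example_read_fn(void *arg)	/* runs in a kworker bound to the target CPU */
{
	struct example_query *q = arg;

	q->val = 0;			/* e.g. sample a per-CPU register into q->val */
	return 0;
}

static long example_read_on(int cpu, struct example_query *q)
{
	long ret;

	get_online_cpus();		/* keep @cpu from going offline meanwhile */
	ret = work_on_cpu(cpu, example_read_fn, q);
	put_online_cpus();
	return ret;
}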