Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 2777
1 file changed, 954 insertions, 1823 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 105f273b6f8..8ab800c7bac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,23 +40,21 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/magic.h> #include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> -#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/atomic.h> @@ -68,21 +66,21 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + +/* + * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file + * creation/removal and hierarchy changing operations including cgroup + * creation, removal, css association and controller rebinding. This outer + * lock is needed mainly to resolve the circular dependency between kernfs + * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. + */ +static DEFINE_MUTEX(cgroup_tree_mutex); + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. - * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. - * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); @@ -91,20 +89,17 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ static DEFINE_MUTEX(cgroup_mutex); #endif -static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutex_or_rcu_locked() \ +#define cgroup_assert_mutexes_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_mutex or RCU read lock required"); - -#ifdef CONFIG_LOCKDEP -#define cgroup_assert_mutex_or_root_locked() \ - WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ - !lockdep_is_held(&cgroup_root_mutex))) -#else -#define cgroup_assert_mutex_or_root_locked() do { } while (0) -#endif + "cgroup_[tree_]mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -120,17 +115,19 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. 
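The SUBSYS() stanzas just below are an x-macro: <linux/cgroup_subsys.h> is included twice, each time with a different SUBSYS() definition, once to build the subsystem pointer array and once to build the parallel name array, so the two can never drift out of sync. A minimal userspace sketch of the trick, with an inline list standing in for the kernel's header (all names illustrative):

        /* one list, expanded twice into parallel, always-consistent arrays */
        #define SUBSYS_LIST \
                SUBSYS(cpu) \
                SUBSYS(memory)

        enum {
        #define SUBSYS(_x) _x ## _id,
                SUBSYS_LIST
        #undef SUBSYS
                SUBSYS_COUNT
        };

        static const char *subsys_name[] = {
        #define SUBSYS(_x) [_x ## _id] = #_x,
                SUBSYS_LIST
        #undef SUBSYS
        };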
- */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; +#undef SUBSYS /* * The dummy hierarchy, reserved for the subsystems that are otherwise @@ -147,15 +144,9 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -175,11 +166,13 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroupfs_root *root, + unsigned long added_mask, unsigned removed_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** @@ -197,8 +190,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], - lockdep_is_held(&cgroup_mutex)); + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_tree_mutex) || + lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; } @@ -209,6 +203,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. 
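The release_agent_path_lock rule introduced earlier in this hunk (writers hold cgroup_mutex plus the spinlock, readers hold either one) lets a short read skip the heavyweight mutex entirely. A rough userspace analogue using pthreads, with every name invented for illustration:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t big_mutex = PTHREAD_MUTEX_INITIALIZER;
        static pthread_spinlock_t path_lock;    /* pthread_spin_init() at startup */
        static char agent_path[256];

        static void set_agent_path(const char *p)
        {
                /* writer: hold both locks so a reader may hold either */
                pthread_mutex_lock(&big_mutex);
                pthread_spin_lock(&path_lock);
                snprintf(agent_path, sizeof(agent_path), "%s", p);
                pthread_spin_unlock(&path_lock);
                pthread_mutex_unlock(&big_mutex);
        }

        static void get_agent_path(char *buf, size_t len)
        {
                /* reader: the cheap spinlock alone suffices */
                pthread_spin_lock(&path_lock);
                snprintf(buf, len, "%s", agent_path);
                pthread_spin_unlock(&path_lock);
        }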
+ */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -227,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -254,54 +268,23 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** - * for_each_subsys - iterate all loaded cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. */ #define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ - else - -/** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. - */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -358,11 +341,10 @@ static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; /* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). + * css_set_rwsem protects the list of css_set objects, and the chain of + * tasks off each css_set. */ -static DEFINE_RWLOCK(css_set_lock); +static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -386,30 +368,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. 
Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ hash_del(&cset->hlist); @@ -421,7 +387,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ + /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -431,10 +397,24 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - write_unlock(&css_set_lock); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -443,16 +423,6 @@ static inline void get_css_set(struct css_set *cset) atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -652,11 +622,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -680,7 +650,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. 
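put_css_set() above keeps the common case lock-free: atomic_add_unless() drops the reference without touching css_set_rwsem whenever the count stays above one, and only the final decrement takes the writer lock so list teardown cannot race with readers. The same dec-unless-last idiom in a userspace sketch with C11 atomics (names illustrative):

        #include <stdatomic.h>
        #include <pthread.h>

        static pthread_rwlock_t set_rwsem = PTHREAD_RWLOCK_INITIALIZER;

        struct refobj { atomic_int refcount; };

        /* add @a to *v and return 1, unless *v == @u; mirrors atomic_add_unless() */
        static int add_unless(atomic_int *v, int a, int u)
        {
                int c = atomic_load(v);

                while (c != u)
                        if (atomic_compare_exchange_weak(v, &c, c + a))
                                return 1;
                return 0;
        }

        static void put_obj(struct refobj *o)
        {
                if (add_unless(&o->refcount, -1, 1))
                        return;                 /* fast path: still shared */

                pthread_rwlock_wrlock(&set_rwsem);
                if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                        /* last reference dropped under the writer lock:
                         * safe to unlink from shared lists and free */
                }
                pthread_rwlock_unlock(&set_rwsem);
        }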
*/ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -698,14 +668,98 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return cset; } +static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *top_cgrp = kf_root->kn->priv; + + return top_cgrp->root; +} + +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroupfs_root *root) +{ + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + +static void cgroup_destroy_root(struct cgroupfs_root *root) +{ + struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + BUG_ON(atomic_read(&root->nr_cgrps)); + BUG_ON(!list_empty(&cgrp->children)); + + /* Rebind all subsystems back to the default hierarchy */ + WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); + + /* + * Release all the links from cset_links to this hierarchy's + * root cgroup + */ + down_write(&css_set_rwsem); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + up_write(&css_set_rwsem); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_destroy_root(root->kf_root); + cgroup_free_root(root); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. + * called with cgroup_mutex and css_set_rwsem held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) @@ -713,8 +767,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *cset; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * No need to lock the task - since we hold cgroup_mutex the * task can't change groups, so the only thing that can happen @@ -735,7 +790,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } @@ -790,78 +845,71 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. 
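cgroup_init_root_id() above relies on idr_alloc_cyclic(), which resumes scanning after the most recently allocated ID so freed low IDs are not recycled immediately. A kernel-context sketch of the call pattern; struct myroot and the reservation of IDs 0 and 1 are illustrative assumptions:

        #include <linux/idr.h>
        #include <linux/gfp.h>

        static DEFINE_IDR(root_idr);

        struct myroot { int hierarchy_id; };

        static int assign_root_id(struct myroot *root)
        {
                /* start at 2 (0 and 1 reserved); end == 0 means no upper bound */
                int id = idr_alloc_cyclic(&root_idr, root, 2, 0, GFP_KERNEL);

                if (id < 0)
                        return id;      /* -ENOMEM, or -ENOSPC when exhausted */
                root->hierarchy_id = id;
                return 0;
        }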
- */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; } -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static umode_t cgroup_file_mode(const struct cftype *cft) { - struct cgroup_name *name; + umode_t mode = 0; - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, dentry->d_name.name); - return name; + if (cft->mode) + return cft->mode; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) + mode |= S_IWUSR; + + return mode; } static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - dput(cgrp->parent->dentry); - - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); - + atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - simple_xattrs_free(&cgrp->xattrs); - - kfree(rcu_dereference_raw(cgrp->name)); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is top cgroup's refcnt reaching zero, which + * indicates that the root should be released. 
+ */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -872,73 +920,40 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); -} - -static void remove_dir(struct dentry *d) +static void cgroup_get(struct cgroup *cgrp) { - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_put(struct cgroup *cgrp) { - struct cfent *cfe; - - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) + return; /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. 
*/ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - if (cft && cfe->type != cft) - continue; + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); +} - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; - break; - } + lockdep_assert_held(&cgroup_tree_mutex); + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -952,81 +967,41 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; int i, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) - continue; - - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } - - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } - - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + for_each_subsys(ss, i) + if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + return -EBUSY; ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - goto out_put; + return ret; /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ + mutex_unlock(&cgroup_mutex); cgroup_clear_dir(cgrp, removed_mask); + mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -1059,35 +1034,21 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); root->subsys_mask &= ~bit; } } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. 
- */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; - + kernfs_activate(cgrp->kn); return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - mutex_lock(&cgroup_root_mutex); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); @@ -1097,13 +1058,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + spin_unlock(&release_agent_path_lock); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1115,9 +1079,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1137,7 +1098,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1242,13 +1203,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); - return -EINVAL; - } - - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } @@ -1276,11 +1234,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1289,9 +1246,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1316,7 +1272,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->top_cgroup.children)) { ret = -EBUSY; goto out_unlock; } @@ -1325,35 
+1281,81 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.release_agent) + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + down_write(&css_set_rwsem); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). 
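cgroup_enable_task_cg_lists() above is a one-shot switch: the flag flips under the writer lock and the same critical section back-fills every pre-existing task, so the fork()/exit() hot paths stay cheap on systems that never mount a hierarchy. The shape of the pattern in a userspace sketch (the back-fill helper is hypothetical):

        #include <pthread.h>
        #include <stdbool.h>

        static pthread_rwlock_t lists_rwsem = PTHREAD_RWLOCK_INITIALIZER;
        static bool lists_enabled;

        static void backfill_existing_items(void);      /* hypothetical */

        static void enable_lists_once(void)
        {
                pthread_rwlock_wrlock(&lists_rwsem);
                if (!lists_enabled) {
                        /* flip and back-fill in one critical section so no
                         * item is linked twice and none is missed */
                        lists_enabled = true;
                        backfill_existing_items();
                }
                pthread_rwlock_unlock(&lists_rwsem);
        }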
+ */ + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_EXITING)) + list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + up_write(&css_set_rwsem); +} static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1361,66 +1363,18 @@ static void init_cgroup_root(struct cgroupfs_root *root) struct cgroup *cgrp = &root->top_cgroup; INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } -} - -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; if (!opts->subsys_mask && !opts->none) - return NULL; + return ERR_PTR(-EINVAL); root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -1428,15 +1382,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. 
- */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1447,291 +1392,202 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_free_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) { - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} + LIST_HEAD(tmp_links); + struct cgroup *root_cgrp = &root->top_cgroup; + struct css_set *cset; + int i, ret; -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) + goto out; + root_cgrp->id = ret; - BUG_ON(!opts->subsys_mask && !opts->none); + /* + * We're accessing css_set_count without locking css_set_rwsem here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto out; - ret = set_anon_super(sb, NULL); + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); if (ret) - return ret; + goto out; - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto destroy_root; - return 0; -} + ret = rebind_subsystems(root, ss_mask, 0); + if (ret) + goto destroy_root; -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + /* + * Link the top cgroup in this hierarchy into all the css_set + * objects. + */ + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + up_write(&css_set_rwsem); - if (!inode) - return -ENOMEM; + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." 
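cgroup_setup_root() above follows kernfs's two-phase bring-up: create the root with KERNFS_ROOT_CREATE_DEACTIVATED, populate files and bind subsystems while nothing is visible, then let kernfs_activate() expose the finished tree atomically. Reduced to a skeleton, with error handling compressed and populate_base_files() an invented stand-in:

        root->kf_root = kernfs_create_root(&my_syscall_ops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED,
                                           root_cgrp);
        if (IS_ERR(root->kf_root))
                return PTR_ERR(root->kf_root);

        ret = populate_base_files(root_cgrp);   /* hypothetical populate step */
        if (ret) {
                kernfs_destroy_root(root->kf_root);
                return ret;
        }

        /* everything is wired up; make the whole tree visible at once */
        kernfs_activate(root->kf_root->kn);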
entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; + kernfs_activate(root_cgrp->kn); + ret = 0; + goto out; + +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: + cgroup_exit_root_id(root); +out: + free_cgrp_cset_links(&tmp_links); + return ret; } static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct cgroup_sb_opts opts; struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; - const struct cred *cred; - - /* First find the desired set of subsystems */ - mutex_lock(&cgroup_mutex); - ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); - if (ret) - goto out_err; + struct cgroup_sb_opts opts; + struct dentry *dentry; + int ret; /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_err; - } - opts.new_root = new_root; - - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_err; - } - - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); - if (ret < 0) - goto unlock_drop; - root_cgrp->id = ret; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto unlock_drop; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto unlock_drop; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. 
- */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); +retry: + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; - revert_creds(cred); + /* look for a matching existing root */ + for_each_active_root(root) { + bool name_match = false; /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - free_cgrp_cset_links(&tmp_links); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. */ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - } - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); - - rm_base_files: - free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - unlock_drop: - cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); + /* + * A root's lifetime is governed by its top cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. 
+ */ + if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); + msleep(10); + goto retry; + } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); + ret = 0; + goto out_unlock; + } - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); + /* no such thing, create a new one */ + root = cgroup_root_from_opts(&opts); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_unlock; } - /* - * Release all the links from cset_links to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - write_unlock(&css_set_lock); +out_unlock: + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + kfree(opts.release_agent); + kfree(opts.name); - cgroup_exit_root_id(root); + if (ret) + return ERR_PTR(ret); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + dentry = kernfs_mount(fs_type, flags, root->kf_root); + if (IS_ERR(dentry)) + cgroup_put(&root->top_cgroup); + return dentry; +} - simple_xattrs_free(&cgrp->xattrs); +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); - kill_litter_super(sb); - cgroup_free_root(root); + cgroup_put(&root->top_cgroup); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -1743,57 +1599,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; /** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. 
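The mount path above treats a zero refcount on the top cgroup as destruction in flight: atomic_inc_not_zero() refuses to resurrect such a root, and because mounting is cold the code just sleeps and retries rather than setting up a waitqueue. A userspace rendering of the tryget-or-back-off idea (C11 atomics, names invented):

        #include <stdatomic.h>
        #include <unistd.h>

        /* take a reference only if the object is not already dying */
        static int tryget(atomic_int *refcnt)
        {
                int c = atomic_load(refcnt);

                while (c != 0)
                        if (atomic_compare_exchange_weak(refcnt, &c, c + 1))
                                return 1;
                return 0;       /* hit zero: teardown is in progress */
        }

        static void get_root_or_wait(atomic_int *refcnt)
        {
                /* cold path: crude sleep-and-retry beats a waitqueue here */
                while (!tryget(refcnt))
                        usleep(10 * 1000);      /* roughly msleep(10) */
        }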
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - -/** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task * @buf: the buffer to write the path into @@ -1804,31 +1609,32 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -1846,7 +1652,6 @@ struct cgroup_taskset { struct flex_array *tc_array; int tc_array_len; int idx; - struct cgroup *cur_cgrp; }; /** @@ -1861,11 +1666,9 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) tset->idx = 0; return cgroup_taskset_next(tset); } else { - tset->cur_cgrp = tset->single.cgrp; return tset->single.task; } } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1882,42 +1685,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) return NULL; tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; return tc->task; } -EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? 
tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - -/* * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp; the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1925,6 +1702,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1937,11 +1717,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + list_move(&tsk->cg_list, &new_cset->tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1949,26 +1725,26 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set_locked(old_cset, false); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached + * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int retval, i, group_size; + int ret, i, group_size; struct cgroupfs_root *root = cgrp->root; struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; + struct task_struct *task; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -1981,7 +1757,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * threads exit, this will just be an over-estimate. */ if (threadgroup) - group_size = get_nr_threads(tsk); + group_size = get_nr_threads(leader); else group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ @@ -1989,8 +1765,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) + ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); + if (ret) goto out_free_group_list; i = 0; @@ -1999,18 +1775,20 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. 
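The commit point in cgroup_task_migrate() above publishes the new css_set with rcu_assign_pointer(), which is essentially a release store: every store that built the new set is ordered before the pointer switch, so a lockless reader using rcu_dereference() observes either the old set or a fully initialized new one, never a half-built mix. The same guarantee expressed as a simplified C11 userspace analogue:

        #include <stdatomic.h>

        struct css_set;                         /* opaque here */
        static _Atomic(struct css_set *) task_cset;

        static void publish_cset(struct css_set *new_cset)
        {
                /* release: initialization of *new_cset happens-before this */
                atomic_store_explicit(&task_cset, new_cset, memory_order_release);
        }

        static struct css_set *read_cset(void)
        {
                /* acquire pairs with the release store above */
                return atomic_load_explicit(&task_cset, memory_order_acquire);
        }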
*/ + down_read(&css_set_rwsem); rcu_read_lock(); + task = leader; do { struct task_and_cgroup ent; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); + ent.task = task; + ent.cgrp = task_cgroup_from_root(task, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) goto next; @@ -2018,21 +1796,22 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); + ret = flex_array_put(group, i, &ent, GFP_ATOMIC); + BUG_ON(ret != 0); i++; next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); + up_read(&css_set_rwsem); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; /* methods shouldn't be called if no task is actually migrating */ - retval = 0; + ret = 0; if (!group_size) goto out_free_group_list; @@ -2041,8 +1820,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { - retval = css->ss->can_attach(css, &tset); - if (retval) { + ret = css->ss->can_attach(css, &tset); + if (ret) { failed_css = css; goto out_cancel_attach; } @@ -2060,7 +1839,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, old_cset = task_css_set(tc->task); tc->cset = find_css_set(old_cset, cgrp); if (!tc->cset) { - retval = -ENOMEM; + ret = -ENOMEM; goto out_put_css_set_refs; } } @@ -2070,10 +1849,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * proceed to move all tasks to the new cgroup. There are no * failure cases after here, so this is the commit point. */ + down_write(&css_set_rwsem); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } + up_write(&css_set_rwsem); /* nothing is sensitive to fork() after this point. */ /* @@ -2086,18 +1867,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 5: success! 
and cleanup */ - retval = 0; + ret = 0; out_put_css_set_refs: - if (retval) { + if (ret) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); if (!tc->cset) break; - put_css_set(tc->cset); + put_css_set(tc->cset, false); } } out_cancel_attach: - if (retval) { + if (ret) { for_each_css(css, i, cgrp) { if (css == failed_css) break; @@ -2107,7 +1888,7 @@ out_cancel_attach: } out_free_group_list: flex_array_free(group); - return retval; + return ret; } /* @@ -2203,7 +1984,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2230,14 +2015,15 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroupfs_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; - mutex_lock(&cgroup_root_mutex); - strcpy(css->cgroup->root->release_agent_path, buffer); - mutex_unlock(&cgroup_root_mutex); + spin_lock(&release_agent_path_lock); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); + spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; } @@ -2262,32 +2048,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; - char *buf; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; int ret; - if (nbytes >= max_bytes) - return -E2BIG; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } - - buf[nbytes] = '\0'; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); if (cft->write_string) { ret = cft->write_string(css, cft, strstrip(buf)); @@ -2306,53 +2083,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, } else { ret = -EINVAL; } -out_free: - kfree(buf); + return ret ?: nbytes; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. 
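cgroup_file_write() above funnels every control file through one handler chain and ends with "return ret ?: nbytes;", the GNU conditional with an omitted middle operand, so a handler returning 0 reports the whole buffer as consumed. A compact userspace sketch of the dispatch, with the types and handler set simplified:

        #include <errno.h>
        #include <stddef.h>
        #include <stdlib.h>

        struct ctl_file {
                int (*write_string)(const char *buf);
                int (*write_u64)(unsigned long long v);
        };

        static long ctl_write(struct ctl_file *cf, char *buf, size_t nbytes)
        {
                int ret;

                if (cf->write_string) {
                        ret = cf->write_string(buf);
                } else if (cf->write_u64) {
                        char *end;
                        unsigned long long v = strtoull(buf, &end, 0);

                        ret = (*end && *end != '\n') ? -EINVAL : cf->write_u64(v);
                } else {
                        ret = -EINVAL;          /* no handler wired up */
                }
                return ret ?: nbytes;           /* GNU ?: extension, as above */
        }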
 
-/*
- * seqfile ops/methods for returning structured data.  Currently just
- * supports string->u64 maps, but can be extended in future.
- */
-
 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
 {
-	struct cftype *cft = seq_cft(seq);
-
-	if (cft->seq_start) {
-		return cft->seq_start(seq, ppos);
-	} else {
-		/*
-		 * The same behavior and code as single_open().  Returns
-		 * !NULL if pos is at the beginning; otherwise, NULL.
-		 */
-		return NULL + !*ppos;
-	}
+	return seq_cft(seq)->seq_start(seq, ppos);
 }
 
 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 {
-	struct cftype *cft = seq_cft(seq);
-
-	if (cft->seq_next) {
-		return cft->seq_next(seq, v, ppos);
-	} else {
-		/*
-		 * The same behavior and code as single_open(), always
-		 * terminate after the initial read.
-		 */
-		++*ppos;
-		return NULL;
-	}
+	return seq_cft(seq)->seq_next(seq, v, ppos);
 }
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	struct cftype *cft = seq_cft(seq);
-
-	if (cft->seq_stop)
-		cft->seq_stop(seq, v);
+	seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2119,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-static struct seq_operations cgroup_seq_operations = {
-	.start		= cgroup_seqfile_start,
-	.next		= cgroup_seqfile_next,
-	.stop		= cgroup_seqfile_stop,
-	.show		= cgroup_seqfile_show,
+static struct kernfs_ops cgroup_kf_single_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.write			= cgroup_file_write,
+	.seq_show		= cgroup_seqfile_show,
 };
 
-static int cgroup_file_open(struct inode *inode, struct file *file)
-{
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
-	struct cgroup_subsys_state *css;
-	struct cgroup_open_file *of;
-	int err;
-
-	err = generic_file_open(inode, file);
-	if (err)
-		return err;
-
-	/*
-	 * If the file belongs to a subsystem, pin the css.  Will be
-	 * unpinned either on open failure or release.  This ensures that
-	 * @css stays alive for all file operations.
-	 */
-	rcu_read_lock();
-	css = cgroup_css(cgrp, cft->ss);
-	if (cft->ss && !css_tryget(css))
-		css = NULL;
-	rcu_read_unlock();
-
-	if (!css)
-		return -ENODEV;
-
-	/*
-	 * @cfe->css is used by read/write/close to determine the
-	 * associated css.  @file->private_data would be a better place but
-	 * that's already used by seqfile.  Multiple accessors may use it
-	 * simultaneously which is okay as the association never changes.
-	 */
-	WARN_ON_ONCE(cfe->css && cfe->css != css);
-	cfe->css = css;
-
-	of = __seq_open_private(file, &cgroup_seq_operations,
-				sizeof(struct cgroup_open_file));
-	if (of) {
-		of->cfe = cfe;
-		return 0;
-	}
-
-	if (css->ss)
-		css_put(css);
-	return -ENOMEM;
-}
-
-static int cgroup_file_release(struct inode *inode, struct file *file)
-{
-	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cgroup_subsys_state *css = cfe->css;
-
-	if (css->ss)
-		css_put(css);
-	return seq_release_private(inode, file);
-}
+static struct kernfs_ops cgroup_kf_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.write			= cgroup_file_write,
+	.seq_start		= cgroup_seqfile_start,
+	.seq_next		= cgroup_seqfile_next,
+	.seq_stop		= cgroup_seqfile_stop,
+	.seq_show		= cgroup_seqfile_show,
+};
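The deleted branches above emulated single_open() for cftypes that provide no seq_start/next/stop: start returns non-NULL exactly once (the NULL + !*ppos trick) and next always terminates, which is why kernfs now gets two ops tables — cgroup_kf_ops for real iterators and cgroup_kf_single_ops for single-record files. A self-contained demonstration of that single-shot iterator contract (hypothetical names; the null-pointer arithmetic mirrors the kernel's trick and is accepted by GCC):

#include <stdio.h>
#include <stddef.h>
typedef long long loff_t;

/* start: non-NULL exactly once, when *ppos == 0 -- same as single_open() */
static void *single_start(loff_t *ppos)
{
	return NULL + !*ppos;	/* arbitrary non-NULL token at pos 0, else NULL */
}

/* next: always terminate after the initial record */
static void *single_next(loff_t *ppos)
{
	++*ppos;
	return NULL;
}

static void show(void)
{
	printf("single record\n");
}

int main(void)
{
	loff_t pos = 0;
	for (void *v = single_start(&pos); v; v = single_next(&pos))
		show();		/* runs exactly once per read cycle */
	return 0;
}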
 
 /*
  * cgroup_rename - Only allow simple rename of directories in place.
  */
-static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
-			 struct inode *new_dir, struct dentry *new_dentry)
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+			 const char *new_name_str)
 {
+	struct cgroup *cgrp = kn->priv;
 	int ret;
-	struct cgroup_name *name, *old_name;
-	struct cgroup *cgrp;
-
-	/*
-	 * It's convenient to use parent dir's i_mutex to protect
-	 * cgrp->name.
-	 */
-	lockdep_assert_held(&old_dir->i_mutex);
 
-	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+	if (kernfs_type(kn) != KERNFS_DIR)
 		return -ENOTDIR;
-	if (new_dentry->d_inode)
-		return -EEXIST;
-	if (old_dir != new_dir)
+	if (kn->parent != new_parent)
 		return -EIO;
 
-	cgrp = __d_cgrp(old_dentry);
-
 	/*
 	 * This isn't a proper migration and its usefulness is very
 	 * limited.  Disallow if sane_behavior.
@@ -2469,218 +2155,29 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (cgroup_sane_behavior(cgrp))
 		return -EPERM;
 
-	name = cgroup_alloc_name(new_dentry);
-	if (!name)
-		return -ENOMEM;
-
-	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (ret) {
-		kfree(name);
-		return ret;
-	}
-
-	old_name = rcu_dereference_protected(cgrp->name, true);
-	rcu_assign_pointer(cgrp->name, name);
-
-	kfree_rcu(old_name, rcu_head);
-	return 0;
-}
-
-static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
-{
-	if (S_ISDIR(dentry->d_inode->i_mode))
-		return &__d_cgrp(dentry)->xattrs;
-	else
-		return &__d_cfe(dentry)->xattrs;
-}
-
-static inline int xattr_enabled(struct dentry *dentry)
-{
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-	return root->flags & CGRP_ROOT_XATTR;
-}
-
-static bool is_valid_xattr(const char *name)
-{
-	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
-	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
-		return true;
-	return false;
-}
-
-static int cgroup_setxattr(struct dentry *dentry, const char *name,
-			   const void *val, size_t size, int flags)
-{
-	if (!xattr_enabled(dentry))
-		return -EOPNOTSUPP;
-	if (!is_valid_xattr(name))
-		return -EINVAL;
-	return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
-}
-
-static int cgroup_removexattr(struct dentry *dentry, const char *name)
-{
-	if (!xattr_enabled(dentry))
-		return -EOPNOTSUPP;
-	if (!is_valid_xattr(name))
-		return -EINVAL;
-	return simple_xattr_remove(__d_xattrs(dentry), name);
-}
-
-static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
-			       void *buf, size_t size)
-{
-	if (!xattr_enabled(dentry))
-		return -EOPNOTSUPP;
-	if (!is_valid_xattr(name))
-		return -EINVAL;
-	return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
-}
-
-static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
-{
-	if (!xattr_enabled(dentry))
-		return -EOPNOTSUPP;
-	return simple_xattr_list(__d_xattrs(dentry), buf, size);
-}
-
-static const struct file_operations cgroup_file_operations = {
-	.read = seq_read,
-	.write = cgroup_file_write,
-	.llseek = generic_file_llseek,
-	.open = cgroup_file_open,
-	.release = cgroup_file_release,
-};
-
-static const struct inode_operations cgroup_file_inode_operations = {
-	.setxattr = cgroup_setxattr,
-	.getxattr = cgroup_getxattr,
-	.listxattr = cgroup_listxattr,
-	.removexattr = cgroup_removexattr,
-};
-
-static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
-	.mkdir = cgroup_mkdir,
-	.rmdir = cgroup_rmdir,
-	.rename = cgroup_rename,
-	.setxattr = cgroup_setxattr,
-	.getxattr = cgroup_getxattr,
-	.listxattr = cgroup_listxattr,
-	.removexattr = cgroup_removexattr,
-};
-
-static int cgroup_create_file(struct dentry *dentry, umode_t mode,
-			      struct super_block *sb)
-{
-	struct inode *inode;
-
-	if (!dentry)
-		return -ENOENT;
-	if (dentry->d_inode)
-		return -EEXIST;
-
-	inode = cgroup_new_inode(mode, sb);
-	if (!inode)
-		return -ENOMEM;
-
-	if (S_ISDIR(mode)) {
-		inode->i_op = &cgroup_dir_inode_operations;
-		inode->i_fop = &simple_dir_operations;
-
-		/* start off with i_nlink == 2 (for "." entry) */
-		inc_nlink(inode);
-		inc_nlink(dentry->d_parent->d_inode);
-
-		/*
-		 * Control reaches here with cgroup_mutex held.
-		 * @inode->i_mutex should nest outside cgroup_mutex but we
-		 * want to populate it immediately without releasing
-		 * cgroup_mutex.  As @inode isn't visible to anyone else
-		 * yet, trylock will always succeed without affecting
-		 * lockdep checks.
-		 */
-		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
-	} else if (S_ISREG(mode)) {
-		inode->i_size = 0;
-		inode->i_fop = &cgroup_file_operations;
-		inode->i_op = &cgroup_file_inode_operations;
-	}
-	d_instantiate(dentry, inode);
-	dget(dentry);	/* Extra count - pin the dentry in core */
-	return 0;
-}
-
-/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
- *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write handler
- */
-static umode_t cgroup_file_mode(const struct cftype *cft)
-{
-	umode_t mode = 0;
-
-	if (cft->mode)
-		return cft->mode;
-
-	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
-		mode |= S_IRUGO;
+	mutex_lock(&cgroup_tree_mutex);
+	mutex_lock(&cgroup_mutex);
 
-	if (cft->write_u64 || cft->write_s64 || cft->write_string ||
-	    cft->trigger)
-		mode |= S_IWUSR;
+	ret = kernfs_rename(kn, new_parent, new_name_str);
 
-	return mode;
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
+	return ret;
 }
 
 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct dentry *dir = cgrp->dentry;
-	struct cgroup *parent = __d_cgrp(dir);
-	struct dentry *dentry;
-	struct cfent *cfe;
-	int error;
-	umode_t mode;
-	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
-
-	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
-	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
-		strcpy(name, cft->ss->name);
-		strcat(name, ".");
-	}
-	strcat(name, cft->name);
+	char name[CGROUP_FILE_NAME_MAX];
+	struct kernfs_node *kn;
+	struct lock_class_key *key = NULL;
 
-	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
-
-	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
-	if (!cfe)
-		return -ENOMEM;
-
-	dentry = lookup_one_len(name, dir, strlen(name));
-	if (IS_ERR(dentry)) {
-		error = PTR_ERR(dentry);
-		goto out;
-	}
-
-	cfe->type = (void *)cft;
-	cfe->dentry = dentry;
-	dentry->d_fsdata = cfe;
-	simple_xattrs_init(&cfe->xattrs);
-
-	mode = cgroup_file_mode(cft);
-	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
-	if (!error) {
-		list_add_tail(&cfe->node, &parent->files);
-		cfe = NULL;
-	}
-	dput(dentry);
-out:
-	kfree(cfe);
-	return error;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	key = &cft->lockdep_key;
+#endif
+	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				  NULL, false, key);
+	return PTR_ERR_OR_ZERO(kn);
 }
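cgroup_add_file() now needs no explicit error unwinding because __kernfs_create_file() follows the kernel's ERR_PTR convention: a failure is a negated errno encoded into the pointer itself, which PTR_ERR_OR_ZERO() flattens back to an int. A userspace re-implementation of the idiom, only to show the mechanics (the real definitions live in the kernel's err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* the top page of the address space never holds a valid object */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline int PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? (int)PTR_ERR(ptr) : 0;
}

/* a creation helper in the style of __kernfs_create_file() */
static void *create_file(int fail)
{
	static int node = 42;
	return fail ? ERR_PTR(-ENOMEM) : (void *)&node;
}

int main(void)
{
	printf("ok:   %d\n", PTR_ERR_OR_ZERO(create_file(0)));	/* 0 */
	printf("fail: %d\n", PTR_ERR_OR_ZERO(create_file(1)));	/* -12 */
	return 0;
}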
 
 /**
@@ -2700,8 +2197,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 	struct cftype *cft;
 	int ret;
 
-	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
-	lockdep_assert_held(&cgroup_mutex);
+	lockdep_assert_held(&cgroup_tree_mutex);
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
@@ -2726,44 +2222,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 	return 0;
 }
 
-static void cgroup_cfts_prepare(void)
-	__acquires(&cgroup_mutex)
-{
-	/*
-	 * Thanks to the entanglement with vfs inode locking, we can't walk
-	 * the existing cgroups under cgroup_mutex and create files.
-	 * Instead, we use css_for_each_descendant_pre() and drop RCU read
-	 * lock before calling cgroup_addrm_files().
-	 */
-	mutex_lock(&cgroup_mutex);
-}
-
-static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
-	__releases(&cgroup_mutex)
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 {
 	LIST_HEAD(pending);
 	struct cgroup_subsys *ss = cfts[0].ss;
 	struct cgroup *root = &ss->root->top_cgroup;
-	struct super_block *sb = ss->root->sb;
-	struct dentry *prev = NULL;
-	struct inode *inode;
 	struct cgroup_subsys_state *css;
-	u64 update_before;
 	int ret = 0;
 
-	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
-	if (!cfts || ss->root == &cgroup_dummy_root ||
-	    !atomic_inc_not_zero(&sb->s_active)) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
+	lockdep_assert_held(&cgroup_tree_mutex);
 
-	/*
-	 * All cgroups which are created after we drop cgroup_mutex will
-	 * have the updated set of files, so we only need to update the
-	 * cgroups created before the current @cgroup_serial_nr_next.
-	 */
-	update_before = cgroup_serial_nr_next;
+	/* don't bother if @ss isn't attached */
+	if (ss->root == &cgroup_dummy_root)
		return 0;
 
 	/* add/rm files for all cgroups created before */
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2243,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 		if (cgroup_is_dead(cgrp))
 			continue;
 
-		inode = cgrp->dentry->d_inode;
-		dget(cgrp->dentry);
-		dput(prev);
-		prev = cgrp->dentry;
-
-		mutex_unlock(&cgroup_mutex);
-		mutex_lock(&inode->i_mutex);
-		mutex_lock(&cgroup_mutex);
-		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
-			ret = cgroup_addrm_files(cgrp, cfts, is_add);
-		mutex_unlock(&inode->i_mutex);
+		ret = cgroup_addrm_files(cgrp, cfts, is_add);
 		if (ret)
 			break;
 	}
-	mutex_unlock(&cgroup_mutex);
-	dput(prev);
-	deactivate_super(sb);
+
+	if (is_add && !ret)
+		kernfs_activate(root->kn);
 	return ret;
 }
 
-/**
- * cgroup_add_cftypes - add an array of cftypes to a subsystem
- * @ss: target cgroup subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Register @cfts to @ss.  Files described by @cfts are created for all
- * existing cgroups to which @ss is attached and all future cgroups will
- * have them too.  This function can be called anytime whether @ss is
- * attached or not.
- *
- * Returns 0 on successful registration, -errno on failure.  Note that this
- * function currently returns 0 as long as @cfts registration is successful
- * even if some file creation attempts on existing cgroups fail.
- */
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static void cgroup_exit_cftypes(struct cftype *cfts)
 {
-	struct cftype_set *set;
 	struct cftype *cft;
-	int ret;
 
-	set = kzalloc(sizeof(*set), GFP_KERNEL);
-	if (!set)
-		return -ENOMEM;
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		/* free copy for custom atomic_write_len, see init_cftypes() */
+		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+			kfree(cft->kf_ops);
+		cft->kf_ops = NULL;
+		cft->ss = NULL;
+	}
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+	struct cftype *cft;
 
-	for (cft = cfts; cft->name[0] != '\0'; cft++)
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		struct kernfs_ops *kf_ops;
+
+		WARN_ON(cft->ss || cft->kf_ops);
+
+		if (cft->seq_start)
+			kf_ops = &cgroup_kf_ops;
+		else
+			kf_ops = &cgroup_kf_single_ops;
+
+		/*
+		 * Ugh... if @cft wants a custom max_write_len, we need to
+		 * make a copy of kf_ops to set its atomic_write_len.
+		 */
+		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+			if (!kf_ops) {
+				cgroup_exit_cftypes(cfts);
+				return -ENOMEM;
+			}
+			kf_ops->atomic_write_len = cft->max_write_len;
+		}
+
+		cft->kf_ops = kf_ops;
 		cft->ss = ss;
+	}
 
-	cgroup_cfts_prepare();
-	set->cfts = cfts;
-	list_add_tail(&set->node, &ss->cftsets);
-	ret = cgroup_cfts_commit(cfts, true);
-	if (ret)
-		cgroup_rm_cftypes(cfts);
-	return ret;
+	return 0;
+}
+
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+	lockdep_assert_held(&cgroup_tree_mutex);
+
+	if (!cfts || !cfts[0].ss)
+		return -ENOENT;
+
+	list_del(&cfts->node);
+	cgroup_apply_cftypes(cfts, false);
+	cgroup_exit_cftypes(cfts);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
 
 /**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2326,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
 */
 int cgroup_rm_cftypes(struct cftype *cfts)
 {
-	struct cftype_set *set;
+	int ret;
 
-	if (!cfts || !cfts[0].ss)
-		return -ENOENT;
+	mutex_lock(&cgroup_tree_mutex);
+	ret = cgroup_rm_cftypes_locked(cfts);
+	mutex_unlock(&cgroup_tree_mutex);
+	return ret;
+}
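cgroup_init_cftypes() above keeps every cftype pointing at one of the two shared kernfs_ops tables and only kmemdup()s a private copy when a cftype requests a non-default max_write_len; cgroup_exit_cftypes() then frees exactly those copies by re-running the same test. The share-until-customized pattern in plain C (memdup stands in for kmemdup; names are illustrative):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct ops { size_t write_len; };

static const struct ops default_ops = { .write_len = 4096 };

static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);
	if (p)
		memcpy(p, src, len);
	return p;
}

/* share the default table unless this user wants a different write_len */
static struct ops *init_ops(size_t custom_len)
{
	struct ops *ops = (struct ops *)&default_ops;

	if (custom_len && custom_len != default_ops.write_len) {
		ops = memdup(&default_ops, sizeof(*ops));
		if (!ops)
			return NULL;
		ops->write_len = custom_len;	/* private copy, safe to modify */
	}
	return ops;
}

static void exit_ops(struct ops *ops)
{
	if (ops != (struct ops *)&default_ops)	/* free only the copies */
		free(ops);
}

int main(void)
{
	struct ops *a = init_ops(0), *b = init_ops(65536);
	printf("%zu %zu\n", a->write_len, b->write_len);
	exit_ops(a);
	exit_ops(b);
	return 0;
}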
 
-	cgroup_cfts_prepare();
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss.  Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too.  This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure.  Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+	int ret;
 
-	list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
-		if (set->cfts == cfts) {
-			list_del(&set->node);
-			kfree(set);
-			cgroup_cfts_commit(cfts, false);
-			return 0;
-		}
-	}
+	if (!cfts || cfts[0].name[0] == '\0')
+		return 0;
+
+	ret = cgroup_init_cftypes(ss, cfts);
+	if (ret)
+		return ret;
+
+	mutex_lock(&cgroup_tree_mutex);
 
-	cgroup_cfts_commit(NULL, false);
-	return -ENOENT;
+	list_add_tail(&cfts->node, &ss->cfts);
+	ret = cgroup_apply_cftypes(cfts, true);
+	if (ret)
+		cgroup_rm_cftypes_locked(cfts);
+
+	mutex_unlock(&cgroup_tree_mutex);
+	return ret;
 }
 
 /**
@@ -2868,57 +2376,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
 *
 * Return the number of tasks in the cgroup.
 */
-int cgroup_task_count(const struct cgroup *cgrp)
+static int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
 	return count;
 }
 
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first call to css_task_iter_start().
- */
-static void cgroup_enable_task_cg_lists(void)
-{
-	struct task_struct *p, *g;
-
-	write_lock(&css_set_lock);
-	use_task_css_set_links = 1;
-
-	/*
-	 * We need tasklist_lock because RCU is not safe against
-	 * while_each_thread().  Besides, a forking task that has passed
-	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
-	 * is not guaranteed to have its child immediately visible in the
-	 * tasklist if we walk through it with RCU.
-	 */
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		task_lock(p);
-
-		/*
-		 * We should check if the process is exiting, otherwise
-		 * it will race with cgroup_exit() in that the list
-		 * entry won't be deleted though the process has exited.
-		 * Do it while holding siglock so that we don't end up
-		 * racing against cgroup_exit().
-		 */
-		spin_lock_irq(&p->sighand->siglock);
-		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
-			list_add(&p->cg_list, &task_css_set(p)->tasks);
-		spin_unlock_irq(&p->sighand->siglock);
-
-		task_unlock(p);
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-	write_unlock(&css_set_lock);
-}
-
 /**
 * css_next_child - find the next child of a given css
 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2406,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
 	struct cgroup *cgrp = parent_css->cgroup;
 	struct cgroup *next;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	/*
	 * @pos could already have been removed.  Once a cgroup is removed,
@@ -2973,7 +2442,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
 
 	return cgroup_css(next, parent_css->ss);
 }
-EXPORT_SYMBOL_GPL(css_next_child);
 
 /**
 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2463,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	/* if first iteration, visit @root */
 	if (!pos)
@@ -3016,7 +2484,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
 
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(css_next_descendant_pre);
 
 /**
 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2503,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
 {
 	struct cgroup_subsys_state *last, *tmp;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	do {
 		last = pos;
@@ -3048,7 +2515,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
 
 	return last;
 }
-EXPORT_SYMBOL_GPL(css_rightmost_descendant);
 
 static struct cgroup_subsys_state *
 css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2550,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
 {
 	struct cgroup_subsys_state *next;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	/* if first iteration, visit leftmost descendant which may be @root */
 	if (!pos)
@@ -3102,7 +2568,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
 	/* no sibling left, visit parent */
 	return css_parent(pos);
 }
-EXPORT_SYMBOL_GPL(css_next_descendant_post);
 
 /**
 * css_advance_task_iter - advance a task iterator to the next css_set
@@ -3146,17 +2611,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
 */
 void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it)
-	__acquires(css_set_lock)
+	__acquires(css_set_rwsem)
 {
-	/*
-	 * The first time anyone tries to iterate across a css, we need to
-	 * enable the list linking each css_set to its tasks, and fix up
-	 * all existing tasks.
-	 */
-	if (!use_task_css_set_links)
-		cgroup_enable_task_cg_lists();
+	/* no one should try to iterate before mounting cgroups */
+	WARN_ON_ONCE(!use_task_css_set_links);
 
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 
 	it->origin_css = css;
 	it->cset_link = &css->cgroup->cset_links;
@@ -3204,180 +2664,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 * Finish task iteration started by css_task_iter_start().
 */
 void css_task_iter_end(struct css_task_iter *it)
-	__releases(css_set_lock)
-{
-	read_unlock(&css_set_lock);
-}
-
-static inline int started_after_time(struct task_struct *t1,
-				     struct timespec *time,
-				     struct task_struct *t2)
-{
-	int start_diff = timespec_compare(&t1->start_time, time);
-	if (start_diff > 0) {
-		return 1;
-	} else if (start_diff < 0) {
-		return 0;
-	} else {
-		/*
-		 * Arbitrarily, if two processes started at the same
-		 * time, we'll say that the lower pointer value
-		 * started first.  Note that t2 may have exited by now
-		 * so this may not be a valid pointer any longer, but
-		 * that's fine - it still serves to distinguish
-		 * between two tasks started (effectively) simultaneously.
-		 */
-		return t1 > t2;
-	}
-}
-
-/*
- * This function is a callback from heap_insert() and is used to order
- * the heap.
- * In this case we order the heap in descending task start time.
- */
-static inline int started_after(void *p1, void *p2)
-{
-	struct task_struct *t1 = p1;
-	struct task_struct *t2 = p2;
-	return started_after_time(t1, &t2->start_time, t2);
-}
-
-/**
- * css_scan_tasks - iterate though all the tasks in a css
- * @css: the css to iterate tasks of
- * @test: optional test callback
- * @process: process callback
- * @data: data passed to @test and @process
- * @heap: optional pre-allocated heap used for task iteration
- *
- * Iterate through all the tasks in @css, calling @test for each, and if it
- * returns %true, call @process for it also.
- *
- * @test may be NULL, meaning always true (select all tasks), which
- * effectively duplicates css_task_iter_{start,next,end}() but does not
- * lock css_set_lock for the call to @process.
- *
- * It is guaranteed that @process will act on every task that is a member
- * of @css for the duration of this call.  This function may or may not
- * call @process for tasks that exit or move to a different css during the
- * call, or are forked or move into the css during the call.
- *
- * Note that @test may be called with locks held, and may in some
- * situations be called multiple times for the same task, so it should be
- * cheap.
- *
- * If @heap is non-NULL, a heap has been pre-allocated and will be used for
- * heap operations (and its "gt" member will be overwritten), else a
- * temporary heap will be used (allocation of which may cause this function
- * to fail).
- */
-int css_scan_tasks(struct cgroup_subsys_state *css,
-		   bool (*test)(struct task_struct *, void *),
-		   void (*process)(struct task_struct *, void *),
-		   void *data, struct ptr_heap *heap)
-{
-	int retval, i;
-	struct css_task_iter it;
-	struct task_struct *p, *dropped;
-	/* Never dereference latest_task, since it's not refcounted */
-	struct task_struct *latest_task = NULL;
-	struct ptr_heap tmp_heap;
-	struct timespec latest_time = { 0, 0 };
-
-	if (heap) {
-		/* The caller supplied our heap and pre-allocated its memory */
-		heap->gt = &started_after;
-	} else {
-		/* We need to allocate our own heap memory */
-		heap = &tmp_heap;
-		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-		if (retval)
-			/* cannot allocate the heap */
-			return retval;
-	}
-
- again:
-	/*
-	 * Scan tasks in the css, using the @test callback to determine
-	 * which are of interest, and invoking @process callback on the
-	 * ones which need an update.  Since we don't want to hold any
-	 * locks during the task updates, gather tasks to be processed in a
-	 * heap structure.  The heap is sorted by descending task start
-	 * time.  If the statically-sized heap fills up, we overflow tasks
-	 * that started later, and in future iterations only consider tasks
-	 * that started after the latest task in the previous pass.  This
-	 * guarantees forward progress and that we don't miss any tasks.
-	 */
-	heap->size = 0;
-	css_task_iter_start(css, &it);
-	while ((p = css_task_iter_next(&it))) {
-		/*
-		 * Only affect tasks that qualify per the caller's callback,
-		 * if he provided one
-		 */
-		if (test && !test(p, data))
-			continue;
-		/*
-		 * Only process tasks that started after the last task
-		 * we processed
-		 */
-		if (!started_after_time(p, &latest_time, latest_task))
-			continue;
-		dropped = heap_insert(heap, p);
-		if (dropped == NULL) {
-			/*
-			 * The new task was inserted; the heap wasn't
-			 * previously full
-			 */
-			get_task_struct(p);
-		} else if (dropped != p) {
-			/*
-			 * The new task was inserted, and pushed out a
-			 * different task
-			 */
-			get_task_struct(p);
-			put_task_struct(dropped);
-		}
-		/*
-		 * Else the new task was newer than anything already in
-		 * the heap and wasn't inserted
-		 */
-	}
-	css_task_iter_end(&it);
-
-	if (heap->size) {
-		for (i = 0; i < heap->size; i++) {
-			struct task_struct *q = heap->ptrs[i];
-			if (i == 0) {
-				latest_time = q->start_time;
-				latest_task = q;
-			}
-			/* Process the task per the caller's callback */
-			process(q, data);
-			put_task_struct(q);
-		}
-		/*
-		 * If we had to process any tasks at all, scan again
-		 * in case some of them were in the middle of forking
-		 * children that didn't get processed.
-		 * Not the most efficient way to do it, but it avoids
-		 * having to take callback_mutex in the fork path
-		 */
-		goto again;
-	}
-	if (heap == &tmp_heap)
-		heap_free(&tmp_heap);
-	return 0;
-}
-
-static void cgroup_transfer_one_task(struct task_struct *task, void *data)
+	__releases(css_set_rwsem)
 {
-	struct cgroup *new_cgroup = data;
-
-	mutex_lock(&cgroup_mutex);
-	cgroup_attach_task(new_cgroup, task, false);
-	mutex_unlock(&cgroup_mutex);
+	up_read(&css_set_rwsem);
 }
 
 /**
@@ -3387,8 +2676,26 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data)
 */
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 {
-	return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
-			      to, NULL);
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret = 0;
+
+	do {
+		css_task_iter_start(&from->dummy_css, &it);
+		task = css_task_iter_next(&it);
+		if (task)
+			get_task_struct(task);
+		css_task_iter_end(&it);
+
+		if (task) {
+			mutex_lock(&cgroup_mutex);
+			ret = cgroup_attach_task(to, task, false);
+			mutex_unlock(&cgroup_mutex);
+			put_task_struct(task);
+		}
+	} while (task && !ret);
+
+	return ret;
 }
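The rewritten cgroup_transfer_tasks() above trades css_scan_tasks()'s heap machinery for a drain loop: hold the iterator just long enough to find one task, pin it with get_task_struct(), drop the iterator's locks, then migrate under the heavier locks and repeat until the source is empty. A userspace analogue of drain-one-under-lock with reference pinning (types and names invented for the sketch):

#include <pthread.h>
#include <stdlib.h>

struct task {
	int refcnt;
	struct task *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct task *src_list;

/* find one remaining task and pin it, without holding list_lock afterwards */
static struct task *get_one(void)
{
	struct task *t;

	pthread_mutex_lock(&list_lock);
	t = src_list;
	if (t)
		t->refcnt++;		/* like get_task_struct() */
	pthread_mutex_unlock(&list_lock);
	return t;
}

static void migrate(struct task *t)
{
	pthread_mutex_lock(&list_lock);	/* stands in for cgroup_mutex */
	src_list = t->next;		/* unlink: drops the list's reference */
	t->refcnt--;
	pthread_mutex_unlock(&list_lock);
	if (--t->refcnt == 0)		/* drop our pin, like put_task_struct() */
		free(t);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct task *t = calloc(1, sizeof(*t));
		t->refcnt = 1;
		t->next = src_list;
		src_list = t;
	}
	struct task *t;
	while ((t = get_one()))
		migrate(t);		/* one task per pass, as in the patch */
	return 0;
}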
 
 /*
@@ -3687,21 +2994,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 */
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 {
-	int ret = -EINVAL;
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
 	struct cgroup *cgrp;
 	struct css_task_iter it;
 	struct task_struct *tsk;
 
+	/* it should be kernfs_node belonging to cgroupfs and is a directory */
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
+		return -EINVAL;
+
+	mutex_lock(&cgroup_mutex);
+
 	/*
-	 * Validate dentry by checking the superblock operations,
-	 * and make sure it's a directory.
+	 * We aren't being called from kernfs and there's no guarantee on
+	 * @kn->priv's validity.  For this and css_tryget_from_dir(),
+	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
 	 */
-	if (dentry->d_sb->s_op != &cgroup_ops ||
-	    !S_ISDIR(dentry->d_inode->i_mode))
-		goto err;
-
-	ret = 0;
-	cgrp = dentry->d_fsdata;
+	rcu_read_lock();
+	cgrp = rcu_dereference(kn->priv);
+	if (!cgrp || cgroup_is_dead(cgrp)) {
+		rcu_read_unlock();
+		mutex_unlock(&cgroup_mutex);
+		return -ENOENT;
+	}
+	rcu_read_unlock();
 
 	css_task_iter_start(&cgrp->dummy_css, &it);
 	while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3043,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 	}
 	css_task_iter_end(&it);
 
-err:
-	return ret;
+	mutex_unlock(&cgroup_mutex);
+	return 0;
 }
 
@@ -3745,7 +3062,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start).  Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup_open_file *of = s->private;
+	struct kernfs_open_file *of = s->private;
 	struct cgroup *cgrp = seq_css(s)->cgroup;
 	struct cgroup_pidlist *l;
 	enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3117,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 
 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_open_file *of = s->private;
+	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 
 	if (l)
@@ -3811,7 +3128,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_open_file *of = s->private;
+	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 	pid_t *p = v;
 	pid_t *end = l->list + l->length;
@@ -3861,23 +3178,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
 	return 0;
 }
 
-/*
- * When dput() is called asynchronously, if umount has been done and
- * then deactivate_super() in cgroup_free_fn() kills the superblock,
- * there's a small window that vfs will see the root dentry with non-zero
- * refcnt and trigger BUG().
- *
- * That's why we hold a reference before dput() and drop it right after.
- */
-static void cgroup_dput(struct cgroup *cgrp)
-{
-	struct super_block *sb = cgrp->root->sb;
-
-	atomic_inc(&sb->s_active);
-	dput(cgrp->dentry);
-	deactivate_super(sb);
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
 {
@@ -3944,7 +3244,7 @@ static struct cftype cgroup_base_files[] = {
 		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
-		.max_write_len = PATH_MAX,
+		.max_write_len = PATH_MAX - 1,
 	},
 	{ }	/* terminate */
 };
@@ -3963,13 +3263,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 
 	/* process cftsets of each subsystem */
 	for_each_subsys(ss, i) {
-		struct cftype_set *set;
+		struct cftype *cfts;
 
 		if (!test_bit(i, &subsys_mask))
 			continue;
 
-		list_for_each_entry(set, &ss->cftsets, node) {
-			ret = cgroup_addrm_files(cgrp, set->cfts, true);
+		list_for_each_entry(cfts, &ss->cfts, node) {
+			ret = cgroup_addrm_files(cgrp, cfts, true);
 			if (ret < 0)
 				goto err;
 		}
@@ -4012,7 +3312,7 @@ static void css_free_work_fn(struct work_struct *work)
 		css_put(css->parent);
 
 	css->ss->css_free(css);
-	cgroup_dput(cgrp);
+	cgroup_put(cgrp);
 }
 
 static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3320,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
 	struct cgroup_subsys_state *css =
		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
 
-	/*
-	 * css holds an extra ref to @cgrp->dentry which is put on the last
-	 * css_put().  dput() requires process context which we don't have.
-	 */
 	INIT_WORK(&css->destroy_work, css_free_work_fn);
 	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
@@ -4033,7 +3329,7 @@ static void css_release(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
+	rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL);
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 
@@ -4058,6 +3354,7 @@ static int online_css(struct cgroup_subsys_state *css)
 	struct cgroup_subsys *ss = css->ss;
 	int ret = 0;
 
+	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	if (ss->css_online)
@@ -4065,7 +3362,7 @@ static int online_css(struct cgroup_subsys_state *css)
 	if (!ret) {
 		css->flags |= CSS_ONLINE;
 		css->cgroup->nr_css++;
-		rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
+		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
 	}
 	return ret;
 }
@@ -4075,6 +3372,7 @@ static void offline_css(struct cgroup_subsys_state *css)
 {
 	struct cgroup_subsys *ss = css->ss;
 
+	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3383,7 @@ static void offline_css(struct cgroup_subsys_state *css)
 
 	css->flags &= ~CSS_ONLINE;
 	css->cgroup->nr_css--;
-	RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
+	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
 }
 
 /**
@@ -4103,7 +3401,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	struct cgroup_subsys_state *css;
 	int err;
 
-	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4116,7 +3413,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	init_css(css, ss, cgrp);
 
-	err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+	err = cgroup_populate_dir(cgrp, 1 << ss->id);
 	if (err)
 		goto err_free;
 
@@ -4124,7 +3421,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	if (err)
 		goto err_free;
 
-	dget(cgrp->dentry);
+	cgroup_get(cgrp);
 	css_get(css->parent);
 
 	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
@@ -4144,35 +3441,27 @@ err_free:
 	return err;
 }
 
-/*
+/**
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
+ * @name: name of the new cgroup
+ * @mode: mode to set on new cgroup
 */
-static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			  umode_t mode)
+static long cgroup_create(struct cgroup *parent, const char *name,
			  umode_t mode)
 {
 	struct cgroup *cgrp;
-	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
 	int ssid, err;
 	struct cgroup_subsys *ss;
-	struct super_block *sb = root->sb;
+	struct kernfs_node *kn;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 
-	name = cgroup_alloc_name(dentry);
-	if (!name) {
-		err = -ENOMEM;
-		goto err_free_cgrp;
-	}
-	rcu_assign_pointer(cgrp->name, name);
+	mutex_lock(&cgroup_tree_mutex);
 
 	/*
	 * Only live parents can have children.  Note that the liveliness
@@ -4183,7 +3472,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
	 */
 	if (!cgroup_lock_live_group(parent)) {
 		err = -ENODEV;
-		goto err_free_name;
+		goto err_unlock_tree;
 	}
 
 	/*
@@ -4196,18 +3485,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		goto err_unlock;
 	}
 
-	/* Grab a reference on the superblock so the hierarchy doesn't
-	 * get deleted on unmount if there are child cgroups.  This
-	 * can be done outside cgroup_mutex, since the sb can't
-	 * disappear while someone has an open control file on the
-	 * fs */
-	atomic_inc(&sb->s_active);
-
 	init_cgroup_housekeeping(cgrp);
 
-	dentry->d_fsdata = cgrp;
-	cgrp->dentry = dentry;
-
 	cgrp->parent = parent;
 	cgrp->dummy_css.parent = &parent->dummy_css;
 	cgrp->root = parent->root;
@@ -4218,24 +3497,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
+	/* create the directory */
+	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+	if (IS_ERR(kn)) {
+		err = PTR_ERR(kn);
+		goto err_free_id;
+	}
+	cgrp->kn = kn;
+
 	/*
-	 * Create directory.  cgroup_create_file() returns with the new
-	 * directory locked on success so that it can be populated without
-	 * dropping cgroup_mutex.
+	 * This extra ref will be put in cgroup_free_fn() and guarantees
+	 * that @cgrp->kn is always accessible.
	 */
-	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
-	if (err < 0)
-		goto err_free_id;
-	lockdep_assert_held(&dentry->d_inode->i_mutex);
+	kernfs_get(kn);
 
 	cgrp->serial_nr = cgroup_serial_nr_next++;
 
 	/* allocation complete, commit to creation */
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
-	root->number_of_cgroups++;
-
-	/* hold a ref to the parent's dentry */
-	dget(parent->dentry);
+	atomic_inc(&root->nr_cgrps);
+	cgroup_get(parent);
 
 	/*
	 * @cgrp is now fully operational.  If something fails after this
@@ -4256,36 +3537,35 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 	}
 
+	kernfs_activate(kn);
+
 	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 
 	return 0;
 
err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
-	/* Release the reference count that we took on the superblock */
-	deactivate_super(sb);
err_unlock:
 	mutex_unlock(&cgroup_mutex);
-err_free_name:
-	kfree(rcu_dereference_raw(cgrp->name));
-err_free_cgrp:
+err_unlock_tree:
+	mutex_unlock(&cgroup_tree_mutex);
 	kfree(cgrp);
 	return err;
 
err_destroy:
 	cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 	return err;
 }
 
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			umode_t mode)
 {
-	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
+	struct cgroup *parent = parent_kn->priv;
 
-	/* the vfs holds inode->i_mutex already */
-	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
+	return cgroup_create(parent, name, mode);
 }
 
 /*
@@ -4298,6 +3578,7 @@ static void css_killed_work_fn(struct work_struct *work)
		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup *cgrp = css->cgroup;
 
+	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 
 	/*
@@ -4315,6 +3596,7 @@ static void css_killed_work_fn(struct work_struct *work)
 		cgroup_destroy_css_killed(cgrp);
 
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 
 	/*
	 * Put the css refs from kill_css().  Each css holds an extra
@@ -4347,7 +3629,11 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 */
 static void kill_css(struct cgroup_subsys_state *css)
 {
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+	/*
+	 * This must happen before css is disassociated with its cgroup.
+	 * See seq_css() for details.
+	 */
+	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 
 	/*
	 * Killing would put the base ref, but we need to keep it alive
@@ -4395,22 +3681,21 @@ static void kill_css(struct cgroup_subsys_state *css)
 static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
-	struct dentry *d = cgrp->dentry;
-	struct cgroup_subsys_state *css;
 	struct cgroup *child;
+	struct cgroup_subsys_state *css;
 	bool empty;
 	int ssid;
 
-	lockdep_assert_held(&d->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	/*
-	 * css_set_lock synchronizes access to ->cset_links and prevents
-	 * @cgrp from being removed while __put_css_set() is in progress.
+	 * css_set_rwsem synchronizes access to ->cset_links and prevents
+	 * @cgrp from being removed while put_css_set() is in progress.
	 */
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	empty = list_empty(&cgrp->cset_links);
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
 	if (!empty)
 		return -EBUSY;
 
@@ -4433,10 +3718,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/*
	 * Initiate massacre of all css's.  cgroup_destroy_css_killed()
	 * will be invoked to perform the rest of destruction once the
-	 * percpu refs of all css's are confirmed to be killed.
+	 * percpu refs of all css's are confirmed to be killed.  This
+	 * involves removing the subsystem's files, so drop cgroup_mutex.
	 */
+	mutex_unlock(&cgroup_mutex);
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
+	mutex_lock(&cgroup_mutex);
 
 	/*
	 * Mark @cgrp dead.  This prevents further task migration and child
@@ -4462,14 +3750,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	if (!cgrp->nr_css)
 		cgroup_destroy_css_killed(cgrp);
 
+	/* remove @cgrp directory along with the base files */
+	mutex_unlock(&cgroup_mutex);
+
 	/*
-	 * Clear the base files and remove @cgrp directory.  The removal
-	 * puts the base ref but we aren't quite done with @cgrp yet, so
-	 * hold onto it.
+	 * There are two control paths which try to determine cgroup from
+	 * dentry without going through kernfs - cgroupstats_build() and
+	 * css_tryget_from_dir().  Those are supported by RCU protecting
+	 * clearing of cgrp->kn->priv backpointer, which should happen
+	 * after all files under it have been removed.
	 */
-	cgroup_addrm_files(cgrp, cgroup_base_files, false);
-	dget(d);
-	cgroup_d_remove_dir(d);
+	kernfs_remove(cgrp->kn);	/* @cgrp has an extra ref on its kn */
+	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+
+	mutex_lock(&cgroup_mutex);
 
 	return 0;
 };
@@ -4486,59 +3780,69 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgrp->parent;
-	struct dentry *d = cgrp->dentry;
 
+	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 
-	dput(d);
+	cgroup_put(cgrp);
 
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 }
 
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+static int cgroup_rmdir(struct kernfs_node *kn)
 {
-	int ret;
-
-	mutex_lock(&cgroup_mutex);
-	ret = cgroup_destroy_locked(dentry->d_fsdata);
-	mutex_unlock(&cgroup_mutex);
+	struct cgroup *cgrp = kn->priv;
+	int ret = 0;
 
-	return ret;
-}
+	/*
+	 * This is self-destruction but @kn can't be removed while this
+	 * callback is in progress.  Let's break active protection.  Once
+	 * the protection is broken, @cgrp can be destroyed at any point.
+	 * Pin it so that it stays accessible.
+	 */
+	cgroup_get(cgrp);
+	kernfs_break_active_protection(kn);
 
-static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
-{
-	INIT_LIST_HEAD(&ss->cftsets);
+	mutex_lock(&cgroup_tree_mutex);
+	mutex_lock(&cgroup_mutex);
 
 	/*
-	 * base_cftset is embedded in subsys itself, no need to worry about
-	 * deregistration.
+	 * @cgrp might already have been destroyed while we're trying to
+	 * grab the mutexes.
	 */
-	if (ss->base_cftypes) {
-		struct cftype *cft;
+	if (!cgroup_is_dead(cgrp))
+		ret = cgroup_destroy_locked(cgrp);
 
-		for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
-			cft->ss = ss;
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 
-		ss->base_cftset.cfts = ss->base_cftypes;
-		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
-	}
+	kernfs_unbreak_active_protection(kn);
+	cgroup_put(cgrp);
+	return ret;
 }
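cgroup_rmdir() above is a self-removal path: the kernfs node being deleted hosts this very callback, so the code pins the cgroup, breaks kernfs's active protection, and must then re-check cgroup_is_dead() after taking the mutexes, since anyone may have completed the destruction in the unprotected window. The pin/relock/recheck shape in miniature (userspace, illustrative names; kernfs's active protection is reduced to a comment):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
	int refcnt;
	int dead;
};

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static void obj_get(struct object *o) { o->refcnt++; }
static void obj_put(struct object *o)
{
	if (--o->refcnt == 0)
		free(o);
}

/* destroy o unless someone else already did while we were unprotected */
static int remove_object(struct object *o)
{
	int ret = 0;

	obj_get(o);	/* pin: o stays accessible after we drop protection */
	/* ... here the kernel calls kernfs_break_active_protection() ... */

	pthread_mutex_lock(&big_lock);
	if (!o->dead)			/* re-check under the locks */
		o->dead = 1;		/* the actual destruction */
	/* already dead: nothing to do, succeed quietly as cgroup_rmdir() does */
	pthread_mutex_unlock(&big_lock);

	/* ... and kernfs_unbreak_active_protection() ... */
	obj_put(o);	/* drop the pin */
	return ret;
}

int main(void)
{
	struct object *o = calloc(1, sizeof(*o));
	o->refcnt = 1;
	printf("first:  %d\n", remove_object(o));	/* destroys */
	printf("second: %d\n", remove_object(o));	/* no-op, already dead */
	obj_put(o);
	return 0;
}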
 
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.remount_fs		= cgroup_remount,
+	.show_options		= cgroup_show_options,
+	.mkdir			= cgroup_mkdir,
+	.rmdir			= cgroup_rmdir,
+	.rename			= cgroup_rename,
+};
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 
-	/* init base cftset */
-	cgroup_init_cftsets(ss);
+	INIT_LIST_HEAD(&ss->cfts);
 
 	/* Create the top cgroup state for this subsystem */
 	ss->root = &cgroup_dummy_root;
@@ -4551,7 +3855,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
-	init_css_set.subsys[ss->subsys_id] = css;
+	init_css_set.subsys[ss->id] = css;
 
 	need_forkexit_callback |= ss->fork || ss->exit;
 
@@ -4563,184 +3867,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(online_css(css));
 
 	mutex_unlock(&cgroup_mutex);
-
-	/* this function shouldn't be used with modular subsystems, since they
-	 * need to register a subsys_id, among other things */
-	BUG_ON(ss->module);
-}
-
-/**
- * cgroup_load_subsys: load and register a modular subsystem at runtime
- * @ss: the subsystem to load
- *
- * This function should be called in a modular subsystem's initcall.  If the
- * subsystem is built as a module, it will be assigned a new subsys_id and set
- * up for use.  If the subsystem is built-in anyway, work is delegated to the
- * simpler cgroup_init_subsys.
- */
-int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
-{
-	struct cgroup_subsys_state *css;
-	int i, ret;
-	struct hlist_node *tmp;
-	struct css_set *cset;
-	unsigned long key;
-
-	/* check name and function validity */
-	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
-	    ss->css_alloc == NULL || ss->css_free == NULL)
-		return -EINVAL;
-
-	/*
-	 * we don't support callbacks in modular subsystems.  this check is
-	 * before the ss->module check for consistency; a subsystem that could
-	 * be a module should still have no callbacks even if the user isn't
-	 * compiling it as one.
-	 */
-	if (ss->fork || ss->exit)
-		return -EINVAL;
-
-	/*
-	 * an optionally modular subsystem is built-in: we want to do nothing,
-	 * since cgroup_init_subsys will have already taken care of it.
-	 */
-	if (ss->module == NULL) {
-		/* a sanity check */
-		BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
-		return 0;
-	}
-
-	/* init base cftset */
-	cgroup_init_cftsets(ss);
-
-	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
-	cgroup_subsys[ss->subsys_id] = ss;
-
-	/*
-	 * no ss->css_alloc seems to need anything important in the ss
-	 * struct, so this can happen first (i.e. before the dummy root
-	 * attachment).
-	 */
-	css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
-	if (IS_ERR(css)) {
-		/* failure case - need to deassign the cgroup_subsys[] slot. */
-		cgroup_subsys[ss->subsys_id] = NULL;
-		mutex_unlock(&cgroup_root_mutex);
-		mutex_unlock(&cgroup_mutex);
-		return PTR_ERR(css);
-	}
-
-	ss->root = &cgroup_dummy_root;
-
-	/* our new subsystem will be attached to the dummy hierarchy. */
-	init_css(css, ss, cgroup_dummy_top);
-
-	/*
-	 * Now we need to entangle the css into the existing css_sets.  unlike
-	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
-	 * will need a new pointer to it; done by iterating the css_set_table.
-	 * furthermore, modifying the existing css_sets will corrupt the hash
-	 * table state, so each changed css_set will need its hash recomputed.
-	 * this is all done under the css_set_lock.
-	 */
-	write_lock(&css_set_lock);
-	hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
-		/* skip entries that we already rehashed */
-		if (cset->subsys[ss->subsys_id])
-			continue;
-		/* remove existing entry */
-		hash_del(&cset->hlist);
-		/* set new value */
-		cset->subsys[ss->subsys_id] = css;
-		/* recompute hash and restore entry */
-		key = css_set_hash(cset->subsys);
-		hash_add(css_set_table, &cset->hlist, key);
-	}
-	write_unlock(&css_set_lock);
-
-	ret = online_css(css);
-	if (ret) {
-		ss->css_free(css);
-		goto err_unload;
-	}
-
-	/* success! */
-	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-
-err_unload:
-	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
-	/* @ss can't be mounted here as try_module_get() would fail */
-	cgroup_unload_subsys(ss);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(cgroup_load_subsys);
-
-/**
- * cgroup_unload_subsys: unload a modular subsystem
- * @ss: the subsystem to unload
- *
- * This function should be called in a modular subsystem's exitcall.  When this
- * function is invoked, the refcount on the subsystem's module will be 0, so
- * the subsystem will not be attached to any hierarchy.
- */
-void cgroup_unload_subsys(struct cgroup_subsys *ss)
-{
-	struct cgrp_cset_link *link;
-	struct cgroup_subsys_state *css;
-
-	BUG_ON(ss->module == NULL);
-
-	/*
-	 * we shouldn't be called if the subsystem is in use, and the use of
-	 * try_module_get() in rebind_subsystems() should ensure that it
-	 * doesn't start being used while we're killing it off.
-	 */
-	BUG_ON(ss->root != &cgroup_dummy_root);
-
-	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
-
-	css = cgroup_css(cgroup_dummy_top, ss);
-	if (css)
-		offline_css(css);
-
-	/* deassign the subsys_id */
-	cgroup_subsys[ss->subsys_id] = NULL;
-
-	/*
-	 * disentangle the css from all css_sets attached to the dummy
-	 * top.  as in loading, we need to pay our respects to the hashtable
-	 * gods.
-	 */
-	write_lock(&css_set_lock);
-	list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		unsigned long key;
-
-		hash_del(&cset->hlist);
-		cset->subsys[ss->subsys_id] = NULL;
-		key = css_set_hash(cset->subsys);
-		hash_add(css_set_table, &cset->hlist, key);
-	}
-	write_unlock(&css_set_lock);
-
-	/*
-	 * remove subsystem's css from the cgroup_dummy_top and free it -
-	 * need to free before marking as null because ss->css_free needs
-	 * the cgrp->subsys pointer to find their state.
-	 */
-	if (css)
-		ss->css_free(css);
-	RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
-
-	mutex_unlock(&cgroup_root_mutex);
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_tree_mutex);
 }
-EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
 
 /**
 * cgroup_init_early - cgroup initialization at system boot
@@ -4767,17 +3895,16 @@ int __init cgroup_init_early(void)
 	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
 	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
 
-	/* at bootup time, we don't worry about modular subsystems */
-	for_each_builtin_subsys(ss, i) {
-		BUG_ON(!ss->name);
-		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
-		BUG_ON(!ss->css_alloc);
-		BUG_ON(!ss->css_free);
-		if (ss->subsys_id != i) {
-			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
-			       ss->name, ss->subsys_id);
-			BUG();
-		}
+	for_each_subsys(ss, i) {
+		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
+		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+		ss->id = i;
+		ss->name = cgroup_subsys_name[i];
 
 		if (ss->early_init)
 			cgroup_init_subsys(ss);
@@ -4797,18 +3924,22 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int i, err;
 
-	err = bdi_init(&cgroup_backing_dev_info);
-	if (err)
-		return err;
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 
-	for_each_builtin_subsys(ss, i) {
+	for_each_subsys(ss, i) {
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
+
+		/*
+		 * cftype registration needs kmalloc and can't be done
+		 * during early_init.  Register base cftypes separately.
+		 */
+		if (ss->base_cftypes)
+			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
 	}
 
 	/* allocate id for the dummy hierarchy */
 	mutex_lock(&cgroup_mutex);
-	mutex_lock(&cgroup_root_mutex);
 
 	/* Add init_css_set to the hash table */
 	key = css_set_hash(init_css_set.subsys);
@@ -4820,28 +3951,20 @@ int __init cgroup_init(void)
					0, 1, GFP_KERNEL);
 	BUG_ON(err < 0);
 
-	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
-	if (!cgroup_kobj) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!cgroup_kobj)
+		return -ENOMEM;
 
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0) {
 		kobject_put(cgroup_kobj);
-		goto out;
+		return err;
 	}
 
 	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-
-out:
-	if (err)
-		bdi_destroy(&cgroup_backing_dev_info);
-
-	return err;
+	return 0;
 }
 
 static int __init cgroup_wq_init(void)
@@ -4886,12 +4009,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 {
 	struct pid *pid;
 	struct task_struct *tsk;
-	char *buf;
+	char *buf, *path;
 	int retval;
 	struct cgroupfs_root *root;
 
 	retval = -ENOMEM;
-	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		goto out;
 
@@ -4904,6 +4027,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 	retval = 0;
 
 	mutex_lock(&cgroup_mutex);
+	down_read(&css_set_rwsem);
 
 	for_each_active_root(root) {
 		struct cgroup_subsys *ss;
@@ -4919,14 +4043,17 @@ int proc_cgroup_show(struct seq_file *m, void *v)
			   root->name);
 		seq_putc(m, ':');
 		cgrp = task_cgroup_from_root(tsk, root);
-		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
-		if (retval < 0)
+		path = cgroup_path(cgrp, buf, PATH_MAX);
+		if (!path) {
+			retval = -ENAMETOOLONG;
 			goto out_unlock;
-		seq_puts(m, buf);
+		}
+		seq_puts(m, path);
 		seq_putc(m, '\n');
 	}
 
out_unlock:
+	up_read(&css_set_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	put_task_struct(tsk);
out_free:
@@ -4952,7 +4079,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
-			   ss->root->number_of_cgroups, !ss->disabled);
+			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5022,12 +4149,12 @@ void cgroup_post_fork(struct task_struct *child)
	 * lock on fork.
	 */
 	if (use_task_css_set_links) {
-		write_lock(&css_set_lock);
+		down_write(&css_set_rwsem);
 		task_lock(child);
 		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &task_css_set(child)->tasks);
 		task_unlock(child);
-		write_unlock(&css_set_lock);
+		up_write(&css_set_rwsem);
 	}
 
 	/*
@@ -5036,15 +4163,7 @@ void cgroup_post_fork(struct task_struct *child)
	 * and addition to css_set.
	 */
 	if (need_forkexit_callback) {
-		/*
-		 * fork/exit callbacks are supported only for builtin
-		 * subsystems, and the builtin section of the subsys
-		 * array is immutable, so we don't need to lock the
-		 * subsys array here.  On the other hand, modular section
-		 * of the array can be freed at module unload, so we
-		 * can't touch that.
-		 */
-		for_each_builtin_subsys(ss, i)
+		for_each_subsys(ss, i)
			if (ss->fork)
				ss->fork(child);
 	}
@@ -5092,15 +4211,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	int i;
 
 	/*
-	 * Unlink from the css_set task list if necessary.
-	 * Optimistically check cg_list before taking
-	 * css_set_lock
+	 * Unlink from the css_set task list if necessary.  Optimistically
+	 * check cg_list before taking css_set_rwsem.
	 */
 	if (!list_empty(&tsk->cg_list)) {
-		write_lock(&css_set_lock);
+		down_write(&css_set_rwsem);
 		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
-		write_unlock(&css_set_lock);
+		up_write(&css_set_rwsem);
 	}
 
 	/* Reassign the task to the init_css_set. */
@@ -5109,11 +4227,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
 	if (run_callbacks && need_forkexit_callback) {
-		/*
-		 * fork/exit callbacks are supported only for builtin
-		 * subsystems, see cgroup_post_fork() for details.
-		 */
-		for_each_builtin_subsys(ss, i) {
+		/* see cgroup_post_fork() for details */
+		for_each_subsys(ss, i) {
			if (ss->exit) {
				struct cgroup_subsys_state *old_css = cset->subsys[i];
				struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5124,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	}
 	task_unlock(tsk);
 
-	put_css_set_taskexit(cset);
+	put_css_set(cset, true);
 }
 
 static void check_for_release(struct cgroup *cgrp)
@@ -5181,16 +4296,17 @@ static void cgroup_release_agent(struct work_struct *work)
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
-		char *pathbuf = NULL, *agentbuf = NULL;
+		char *pathbuf = NULL, *agentbuf = NULL, *path;
 		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
 		list_del_init(&cgrp->release_list);
 		raw_spin_unlock(&release_list_lock);
-		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 		if (!pathbuf)
			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+		if (!path)
			goto continue_free;
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
@@ -5198,7 +4314,7 @@ static void cgroup_release_agent(struct work_struct *work)
 
 		i = 0;
 		argv[i++] = agentbuf;
-		argv[i++] = pathbuf;
+		argv[i++] = path;
 		argv[i] = NULL;
 
 		i = 0;
@@ -5232,11 +4348,7 @@ static int __init cgroup_disable(char *str)
 		if (!*token)
			continue;
 
-		/*
-		 * cgroup_disable, being at boot time, can't know about
-		 * module subsystems, so we don't worry about them.
-		 */
-		for_each_builtin_subsys(ss, i) {
+		for_each_subsys(ss, i) {
			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
@@ -5250,28 +4362,42 @@ static int __init cgroup_disable(char *str)
 __setup("cgroup_disable=", cgroup_disable);
 
 /**
- * css_from_dir - get corresponding css from the dentry of a cgroup dir
+ * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
- * Must be called under cgroup_mutex or RCU read lock.  The caller is
- * responsible for pinning the returned css if it needs to be accessed
- * outside the critical section.
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it.  If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
 */
-struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
-					 struct cgroup_subsys *ss)
+struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
						struct cgroup_subsys *ss)
 {
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup_subsys_state *css = NULL;
 	struct cgroup *cgrp;
 
-	cgroup_assert_mutex_or_rcu_locked();
-
-	/* is @dentry a cgroup dir? */
-	if (!dentry->d_inode ||
-	    dentry->d_inode->i_op != &cgroup_dir_inode_operations)
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);
 
-	cgrp = __d_cgrp(dentry);
-	return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
+	rcu_read_lock();
+
+	/*
+	 * This path doesn't originate from kernfs and @kn could already
+	 * have been or be removed at any point.  @kn->priv is RCU
+	 * protected for this access.  See destroy_locked() for details.
+	 */
+	cgrp = rcu_dereference(kn->priv);
+	if (cgrp)
+		css = cgroup_css(cgrp, ss);
+
+	if (!css || !css_tryget(css))
+		css = ERR_PTR(-ENOENT);
+
+	rcu_read_unlock();
+	return css;
 }
 
 /**
@@ -5286,7 +4412,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	struct cgroup *cgrp;
 
-	cgroup_assert_mutex_or_rcu_locked();
+	cgroup_assert_mutexes_or_rcu_locked();
 
 	cgrp = idr_find(&ss->root->cgroup_idr, id);
 	if (cgrp)
@@ -5338,23 +4464,30 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
 
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
-		const char *name;
+		const char *name = "?";
+
+		if (c != cgroup_dummy_top) {
+			cgroup_name(c, name_buf, NAME_MAX + 1);
+			name = name_buf;
+		}
 
-		if (c->dentry)
-			name = c->dentry->d_name.name;
-		else
-			name = "?";
 		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
 	}
 	rcu_read_unlock();
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
+	kfree(name_buf);
 	return 0;
 }
 
@@ -5364,7 +4497,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	read_lock(&css_set_lock);
+	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -5380,7 +4513,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
			}
		}
 	}
-	read_unlock(&css_set_lock);
+	up_read(&css_set_rwsem);
 	return 0;
 }
 
@@ -5423,11 +4556,9 @@ static struct cftype debug_files[] = {
 	{ }	/* terminate */
 };
 
-struct cgroup_subsys debug_subsys = {
-	.name = "debug",
+struct cgroup_subsys debug_cgrp_subsys = {
 	.css_alloc = debug_css_alloc,
 	.css_free = debug_css_free,
-	.subsys_id = debug_subsys_id,
 	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
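css_tryget_from_dir() above and cgroupstats_build() earlier share one subtlety: they reach a cgroup through kn->priv without any help from kernfs, which is exactly why cgroup_destroy_locked() severs that backpointer with RCU_INIT_POINTER() only after kernfs_remove(), and why both readers wrap the dereference in rcu_read_lock(). A condensed kernel-style sketch of the reader/destroyer pairing; this is illustrative only, assumes the surrounding cgroup definitions, and will not build on its own:

/* reader: may run at any time, even against a dying cgroup */
static struct cgroup *cgroup_from_kn(struct kernfs_node *kn)
{
	struct cgroup *cgrp;

	rcu_read_lock();
	cgrp = rcu_dereference(kn->priv);	/* NULL once destruction severed it */
	if (cgrp && cgroup_is_dead(cgrp))
		cgrp = NULL;
	/* a real caller must pin cgrp (e.g. css_tryget()) inside this section */
	rcu_read_unlock();
	return cgrp;
}

/* destroyer: remove all files first, then sever the backpointer */
static void cgroup_sever_kn(struct cgroup *cgrp)
{
	kernfs_remove(cgrp->kn);	/* no new file operations can start */
	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
	/* in-flight readers see either the old cgroup or NULL, never garbage */
}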