diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 904 |
1 files changed, 559 insertions, 345 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f7..2905977e0f3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,6 +63,9 @@ #include <linux/atomic.h> +/* css deactivation bias, makes css->refcnt negative to deny new trygets */ +#define CSS_DEACT_BIAS INT_MIN + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -127,6 +130,9 @@ struct cgroupfs_root { /* A list running through the active hierarchies */ struct list_head root_list; + /* All cgroups on this root, cgroup_mutex protected */ + struct list_head allcg_list; + /* Hierarchy-specific flags */ unsigned long flags; @@ -145,6 +151,15 @@ struct cgroupfs_root { static struct cgroupfs_root rootnode; /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. + */ +struct cfent { + struct list_head node; + struct dentry *dentry; + struct cftype *type; +}; + +/* * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when * cgroup_subsys->use_id != 0. */ @@ -239,6 +254,14 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +/* the current nr of refs, always >= 0 whether @css is deactivated or not */ +static int css_refcnt(struct cgroup_subsys_state *css) +{ + int v = atomic_read(&css->refcnt); + + return v >= 0 ? v : v - CSS_DEACT_BIAS; +} + /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) { @@ -279,6 +302,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) #define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) +static inline struct cgroup *__d_cgrp(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cfent *__d_cfe(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ + return __d_cfe(dentry)->type; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -816,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) struct cgroup_subsys *ss; int ret = 0; - for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); - if (ret) - break; + for_each_subsys(cgrp->root, ss) { + if (!ss->pre_destroy) + continue; + + ret = ss->pre_destroy(cgrp); + if (ret) { + /* ->pre_destroy() failure is being deprecated */ + WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); + break; } + } return ret; } @@ -846,7 +889,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -864,6 +907,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) BUG_ON(!list_empty(&cgrp->pidlists)); kfree_rcu(cgrp, rcu_head); + } else { + struct cfent *cfe = __d_cfe(dentry); + struct cgroup *cgrp = dentry->d_parent->d_fsdata; + + WARN_ONCE(!list_empty(&cfe->node) && + cgrp != &cgrp->root->top_cgroup, + "cfe still linked for %s\n", cfe->type->name); + kfree(cfe); } iput(inode); } @@ -882,34 +933,36 @@ static void remove_dir(struct dentry *d) dput(parent); } -static void cgroup_clear_directory(struct dentry *dentry) -{ - struct list_head *node; - - BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dentry->d_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(node); - if (d->d_inode) { - /* This should never be called on a cgroup - * directory with child cgroups */ - BUG_ON(d->d_inode->i_mode & S_IFDIR); - dget_dlock(d); - spin_unlock(&d->d_lock); - spin_unlock(&dentry->d_lock); - d_delete(d); - simple_unlink(dentry->d_inode, d); - dput(d); - spin_lock(&dentry->d_lock); - } else - spin_unlock(&d->d_lock); - node = dentry->d_subdirs.next; +static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + struct cfent *cfe; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + list_for_each_entry(cfe, &cgrp->files, node) { + struct dentry *d = cfe->dentry; + + if (cft && cfe->type != cft) + continue; + + dget(d); + d_delete(d); + simple_unlink(d->d_inode, d); + list_del_init(&cfe->node); + dput(d); + + return 0; } - spin_unlock(&dentry->d_lock); + return -ENOENT; +} + +static void cgroup_clear_directory(struct dentry *dir) +{ + struct cgroup *cgrp = __d_cgrp(dir); + + while (!list_empty(&cgrp->files)) + cgroup_rm_file(cgrp, NULL); } /* @@ -1015,7 +1068,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1078,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1294,6 +1347,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; + /* See feature-removal-schedule.txt */ + if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) + pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { @@ -1308,7 +1366,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* (re)populate subsystem files */ + /* clear out any existing files and repopulate subsystem files */ + cgroup_clear_directory(cgrp->dentry); cgroup_populate_dir(cgrp); if (opts.release_agent) @@ -1333,6 +1392,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) { INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1344,11 +1404,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; + INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; + list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); } @@ -1472,7 +1535,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1543,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; @@ -1696,16 +1755,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - /** * cgroup_path - generate the path of a cgroup * @cgrp: the cgroup in question @@ -1763,6 +1812,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; + struct css_set *cg; }; struct cgroup_taskset { @@ -1843,11 +1893,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * will already exist. If not set, this function might sleep, and can fail with * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1906,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. */ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1924,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1905,11 +1936,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval; + int retval = 0; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1925,7 +1957,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1939,13 +1971,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1967,7 +2003,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -1997,66 +2033,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. - */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +2046,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2102,23 +2070,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". - */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + rcu_read_lock(); do { struct task_and_cgroup ent; @@ -2128,24 +2087,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); + rcu_read_unlock(); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ retval = 0; @@ -2157,7 +2116,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2169,17 +2128,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2190,8 +2144,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2200,7 +2153,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2209,21 +2162,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. */ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); +out_put_css_set_refs: + if (retval) { + for (i = 0; i < group_size; i++) { + tc = flex_array_get(group, i); + if (!tc->cg) + break; + put_css_set(tc->cg); + } } out_cancel_attach: - /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -2245,22 +2199,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2217,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2260,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** @@ -2710,50 +2656,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -int cgroup_add_file(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, + const struct cftype *cft) { struct dentry *dir = cgrp->dentry; + struct cgroup *parent = __d_cgrp(dir); struct dentry *dentry; + struct cfent *cfe; int error; umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + + /* does @cft->flags tell us to skip creation on @cgrp? */ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + return 0; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + return 0; + if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); } strcat(name, cft->name); + BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); + + cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); + if (!cfe) + return -ENOMEM; + dentry = lookup_one_len(name, dir, strlen(name)); - if (!IS_ERR(dentry)) { - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, - cgrp->root->sb); - if (!error) - dentry->d_fsdata = (void *)cft; - dput(dentry); - } else + if (IS_ERR(dentry)) { error = PTR_ERR(dentry); + goto out; + } + + mode = cgroup_file_mode(cft); + error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); + if (!error) { + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + list_add_tail(&cfe->node, &parent->files); + cfe = NULL; + } + dput(dentry); +out: + kfree(cfe); return error; } -EXPORT_SYMBOL_GPL(cgroup_add_file); -int cgroup_add_files(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype cft[], - int count) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + const struct cftype cfts[], bool is_add) { - int i, err; - for (i = 0; i < count; i++) { - err = cgroup_add_file(cgrp, subsys, &cft[i]); - if (err) - return err; + const struct cftype *cft; + int err, ret = 0; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + if (is_add) + err = cgroup_add_file(cgrp, subsys, cft); + else + err = cgroup_rm_file(cgrp, cft); + if (err) { + pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", + is_add ? "add" : "remove", cft->name, err); + ret = err; + } + } + return ret; +} + +static DEFINE_MUTEX(cgroup_cft_mutex); + +static void cgroup_cfts_prepare(void) + __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) +{ + /* + * Thanks to the entanglement with vfs inode locking, we can't walk + * the existing cgroups under cgroup_mutex and create files. + * Instead, we increment reference on all cgroups and build list of + * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure + * exclusive access to the field. + */ + mutex_lock(&cgroup_cft_mutex); + mutex_lock(&cgroup_mutex); +} + +static void cgroup_cfts_commit(struct cgroup_subsys *ss, + const struct cftype *cfts, bool is_add) + __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) +{ + LIST_HEAD(pending); + struct cgroup *cgrp, *n; + + /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ + if (cfts && ss->root != &rootnode) { + list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { + dget(cgrp->dentry); + list_add_tail(&cgrp->cft_q_node, &pending); + } + } + + mutex_unlock(&cgroup_mutex); + + /* + * All new cgroups will see @cfts update on @ss->cftsets. Add/rm + * files for all cgroups which were created before. + */ + list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { + struct inode *inode = cgrp->dentry->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + if (!cgroup_is_removed(cgrp)) + cgroup_addrm_files(cgrp, ss, cfts, is_add); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + + list_del_init(&cgrp->cft_q_node); + dput(cgrp->dentry); } + + mutex_unlock(&cgroup_cft_mutex); +} + +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ + struct cftype_set *set; + + set = kzalloc(sizeof(*set), GFP_KERNEL); + if (!set) + return -ENOMEM; + + cgroup_cfts_prepare(); + set->cfts = cfts; + list_add_tail(&set->node, &ss->cftsets); + cgroup_cfts_commit(ss, cfts, true); + return 0; } -EXPORT_SYMBOL_GPL(cgroup_add_files); +EXPORT_SYMBOL_GPL(cgroup_add_cftypes); + +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts from @ss. Files described by @cfts are removed from + * all existing cgroups to which @ss is attached and all future cgroups + * won't have them either. This function can be called anytime whether @ss + * is attached or not. + * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered with @ss. + */ +int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ + struct cftype_set *set; + + cgroup_cfts_prepare(); + + list_for_each_entry(set, &ss->cftsets, node) { + if (set->cfts == cfts) { + list_del_init(&set->node); + cgroup_cfts_commit(ss, cfts, false); + return 0; + } + } + + cgroup_cfts_commit(ss, NULL, false); + return -ENOENT; +} /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -2804,15 +2891,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2824,6 +2916,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -3043,6 +3136,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3694,13 +3819,14 @@ static struct cftype files[] = { .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, -}; - -static struct cftype cft_release_agent = { - .name = "release_agent", - .read_seq_string = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, + }, + { } /* terminate */ }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -3708,22 +3834,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp) int err; struct cgroup_subsys *ss; - /* First clear out any existing files */ - cgroup_clear_directory(cgrp->dentry); - - err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); + err = cgroup_addrm_files(cgrp, NULL, files, true); if (err < 0) return err; - if (cgrp == cgrp->top_cgroup) { - if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) - return err; - } - + /* process cftsets of each subsystem */ for_each_subsys(cgrp->root, ss) { + struct cftype_set *set; + if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) return err; + + list_for_each_entry(set, &ss->cftsets, node) + cgroup_addrm_files(cgrp, ss, set->cfts, true); } + /* This cgroup is ready now */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -3739,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp) return 0; } +static void css_dput_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, dput_work); + + dput(css->cgroup->dentry); +} + static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -3751,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; + + /* + * If !clear_css_refs, css holds an extra ref to @cgrp->dentry + * which is put on the last css_put(). dput() requires process + * context, which css_put() may be called without. @css->dput_work + * will be used to invoke dput() asynchronously from css_put(). + */ + INIT_WORK(&css->dput_work, css_dput_fn); + if (ss->__DEPRECATED_clear_css_refs) + set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } static void cgroup_lock_hierarchy(struct cgroupfs_root *root) @@ -3827,7 +3970,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3841,7 +3984,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3853,9 +3996,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; + /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + for_each_subsys(root, ss) + if (!ss->__DEPRECATED_clear_css_refs) + dget(dentry); + /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + err = cgroup_populate_dir(cgrp); /* If err < 0, we have a half-filled directory - oh well ;) */ @@ -3875,7 +4025,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -3895,18 +4045,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +/* + * Check the reference count on each subsystem. Since we already + * established that there are no tasks in the cgroup, if the css refcount + * is also 1, then there should be no outstanding references, so the + * subsystem is safe to destroy. We scan across all subsystems rather than + * using the per-hierarchy linked list of mounted subsystems since we can + * be called via check_for_release() with no synchronization other than + * RCU, and the subsystem linked list isn't RCU-safe. + */ static int cgroup_has_css_refs(struct cgroup *cgrp) { - /* Check the reference count on each subsystem. Since we - * already established that there are no tasks in the - * cgroup, if the css refcount is also 1, then there should - * be no outstanding references, so the subsystem is safe to - * destroy. We scan across all subsystems rather than using - * the per-hierarchy linked list of mounted subsystems since - * we can be called via check_for_release() with no - * synchronization other than RCU, and the subsystem linked - * list isn't RCU-safe */ int i; + /* * We won't need to lock the subsys array, because the subsystems * we're concerned about aren't going anywhere since our cgroup root @@ -3915,17 +4066,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; + /* Skip subsystems not present or not in this hierarchy */ if (ss == NULL || ss->root != cgrp->root) continue; + css = cgrp->subsys[ss->subsys_id]; - /* When called from check_for_release() it's possible + /* + * When called from check_for_release() it's possible * that by this point the cgroup has been removed * and the css deleted. But a false-positive doesn't * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the - * release agent to be called anyway. */ - if (css && (atomic_read(&css->refcnt) > 1)) + * release agent to be called anyway. + */ + if (css && css_refcnt(css) > 1) return 1; } return 0; @@ -3935,51 +4090,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * Atomically mark all (or else none) of the cgroup's CSS objects as * CSS_REMOVED. Return true on success, or false if the cgroup has * busy subsystems. Call with cgroup_mutex held + * + * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or + * not, cgroup removal behaves differently. + * + * If clear is set, css refcnt for the subsystem should be zero before + * cgroup removal can be committed. This is implemented by + * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be + * called multiple times until all css refcnts reach zero and is allowed to + * veto removal on any invocation. This behavior is deprecated and will be + * removed as soon as the existing user (memcg) is updated. + * + * If clear is not set, each css holds an extra reference to the cgroup's + * dentry and cgroup removal proceeds regardless of css refs. + * ->pre_destroy() will be called at least once and is not allowed to fail. + * On the last put of each css, whenever that may be, the extra dentry ref + * is put so that dentry destruction happens only after all css's are + * released. */ - static int cgroup_clear_css_refs(struct cgroup *cgrp) { struct cgroup_subsys *ss; unsigned long flags; bool failed = false; + local_irq_save(flags); + + /* + * Block new css_tryget() by deactivating refcnt. If all refcnts + * for subsystems w/ clear_css_refs set were 1 at the moment of + * deactivation, we succeeded. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - int refcnt; - while (1) { - /* We can only remove a CSS with a refcnt==1 */ - refcnt = atomic_read(&css->refcnt); - if (refcnt > 1) { - failed = true; - goto done; - } - BUG_ON(!refcnt); - /* - * Drop the refcnt to 0 while we check other - * subsystems. This will cause any racing - * css_tryget() to spin until we set the - * CSS_REMOVED bits or abort - */ - if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) - break; - cpu_relax(); - } + + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); + + if (ss->__DEPRECATED_clear_css_refs) + failed |= css_refcnt(css) != 1; } - done: + + /* + * If succeeded, set REMOVED and put all the base refs; otherwise, + * restore refcnts to positive values. Either way, all in-progress + * css_tryget() will be released. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - if (failed) { - /* - * Restore old refcnt if we previously managed - * to clear it from 1 to 0 - */ - if (!atomic_read(&css->refcnt)) - atomic_set(&css->refcnt, 1); - } else { - /* Commit the fact that the CSS is removed */ + + if (!failed) { set_bit(CSS_REMOVED, &css->flags); + css_put(css); + } else { + atomic_sub(CSS_DEACT_BIAS, &css->refcnt); } } + local_irq_restore(flags); return !failed; } @@ -4064,6 +4231,8 @@ again: list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); + list_del_init(&cgrp->allcg_node); + d = dget(cgrp->dentry); cgroup_d_remove_dir(d); @@ -4090,16 +4259,33 @@ again: return 0; } +static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +{ + INIT_LIST_HEAD(&ss->cftsets); + + /* + * base_cftset is embedded in subsys itself, no need to worry about + * deregistration. + */ + if (ss->base_cftypes) { + ss->base_cftset.cfts = ss->base_cftypes; + list_add_tail(&ss->base_cftset.node, &ss->cftsets); + } +} + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + /* init base cftset */ + cgroup_init_cftsets(ss); + /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4165,6 +4351,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) return 0; } + /* init base cftset */ + cgroup_init_cftsets(ss); + /* * need to register a subsys id before anything else - for example, * init_cgroup_css needs it. @@ -4188,7 +4377,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4206,7 +4395,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4304,7 +4493,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4580,7 +4769,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4596,6 +4785,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { @@ -4682,7 +4882,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -4743,21 +4943,41 @@ static void check_for_release(struct cgroup *cgrp) } /* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css, int count) +bool __css_tryget(struct cgroup_subsys_state *css) +{ + do { + int v = css_refcnt(css); + + if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + return true; + cpu_relax(); + } while (!test_bit(CSS_REMOVED, &css->flags)); + + return false; +} +EXPORT_SYMBOL_GPL(__css_tryget); + +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; - int val; + rcu_read_lock(); - val = atomic_sub_return(count, &css->refcnt); - if (val == 1) { + atomic_dec(&css->refcnt); + switch (css_refcnt(css)) { + case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } cgroup_wakeup_rmdir_waiter(cgrp); + break; + case 0: + if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) + schedule_work(&css->dput_work); + break; } rcu_read_unlock(); - WARN_ON_ONCE(val < 1); } EXPORT_SYMBOL_GPL(__css_put); @@ -4876,7 +5096,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, css_refcnt(css)); if (cssid) return cssid->id; @@ -4888,7 +5108,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) { struct css_id *cssid; - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, css_refcnt(css)); if (cssid) return cssid->depth; @@ -4939,9 +5159,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4967,10 +5187,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4985,9 +5205,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4999,7 +5219,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5087,6 +5307,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5094,10 +5316,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { @@ -5137,8 +5356,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5148,7 +5366,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } @@ -5271,19 +5489,15 @@ static struct cftype debug_files[] = { .name = "releasable", .read_u64 = releasable_read, }, -}; -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, debug_files, - ARRAY_SIZE(debug_files)); -} + { } /* terminate */ +}; struct cgroup_subsys debug_subsys = { .name = "debug", .create = debug_create, .destroy = debug_destroy, - .populate = debug_populate, .subsys_id = debug_subsys_id, + .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ |