summaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c293
1 files changed, 196 insertions, 97 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7b69b8d0313..909a35510af 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id {
};
/*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
/*
@@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
+static int clone_children(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -321,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
return &css_set_table[index];
}
-static void free_css_set_rcu(struct rcu_head *obj)
-{
- struct css_set *cg = container_of(obj, struct css_set, rcu_head);
- kfree(cg);
-}
-
/* We don't maintain the lists running through each css_set to its
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
@@ -370,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
write_unlock(&css_set_lock);
- call_rcu(&cg->rcu_head, free_css_set_rcu);
+ kfree_rcu(cg, rcu_head);
}
/*
@@ -759,6 +758,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
*/
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
@@ -777,6 +777,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -805,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
return ret;
}
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
- kfree(cgrp);
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -849,11 +843,16 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
BUG_ON(!list_empty(&cgrp->pidlists));
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+ kfree_rcu(cgrp, rcu_head);
}
iput(inode);
}
+static int cgroup_delete(const struct dentry *d)
+{
+ return 1;
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -868,25 +867,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
struct list_head *node;
BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
- spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+ spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
* directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
- d = dget_locked(d);
- spin_unlock(&dcache_lock);
+ dget_dlock(d);
+ spin_unlock(&d->d_lock);
+ spin_unlock(&dentry->d_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
- spin_lock(&dcache_lock);
- }
+ spin_lock(&dentry->d_lock);
+ } else
+ spin_unlock(&d->d_lock);
node = dentry->d_subdirs.next;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
}
/*
@@ -894,11 +897,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
*/
static void cgroup_d_remove_dir(struct dentry *dentry)
{
+ struct dentry *parent;
+
cgroup_clear_directory(dentry);
- spin_lock(&dcache_lock);
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dcache_lock);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
remove_dir(dentry);
}
@@ -1039,6 +1047,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (clone_children(&root->top_cgroup))
+ seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
mutex_unlock(&cgroup_mutex);
@@ -1049,6 +1059,7 @@ struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ bool clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
@@ -1065,7 +1076,8 @@ struct cgroup_sb_opts {
*/
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
int i;
bool module_pin_failed = false;
@@ -1081,22 +1093,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
- if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "none")) {
+ if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
- } else if (!strcmp(token, "noprefix")) {
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->clone_children = true;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
@@ -1104,7 +1121,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- } else if (!strncmp(token, "name=", 5)) {
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
@@ -1126,20 +1145,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
- } else {
- struct cgroup_subsys *ss;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+
+ continue;
+ }
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ set_bit(i, &opts->subsys_bits);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise 'all, 'none' and a subsystem name options were not
+ * specified, let's default to 'all'
+ */
+ if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (ss->disabled)
+ continue;
+ set_bit(i, &opts->subsys_bits);
}
}
@@ -1354,6 +1397,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
+ if (opts->clone_children)
+ set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
@@ -1397,6 +1442,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
static int cgroup_get_rootdir(struct super_block *sb)
{
+ static const struct dentry_operations cgroup_dops = {
+ .d_iput = cgroup_diput,
+ .d_delete = cgroup_delete,
+ };
+
struct inode *inode =
cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
struct dentry *dentry;
@@ -1414,12 +1464,14 @@ static int cgroup_get_rootdir(struct super_block *sb)
return -ENOMEM;
}
sb->s_root = dentry;
+ /* for everything else we want ->d_op set */
+ sb->s_d_op = &cgroup_dops;
return 0;
}
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
struct cgroup_sb_opts opts;
struct cgroupfs_root *root;
@@ -1553,10 +1605,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
drop_parsed_module_refcounts(opts.subsys_bits);
}
- simple_set_mnt(mnt, sb);
kfree(opts.release_agent);
kfree(opts.name);
- return 0;
+ return dget(sb->s_root);
drop_new_super:
deactivate_locked_super(sb);
@@ -1565,7 +1616,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
out_err:
kfree(opts.release_agent);
kfree(opts.name);
- return ret;
+ return ERR_PTR(ret);
}
static void cgroup_kill_sb(struct super_block *sb) {
@@ -1615,7 +1666,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
- .get_sb = cgroup_get_sb,
+ .mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
};
@@ -1749,10 +1800,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
/* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
@@ -1879,6 +1928,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (strlen(buffer) >= PATH_MAX)
+ return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
strcpy(cgrp->root->release_agent_path, buffer);
@@ -2136,12 +2187,20 @@ static const struct file_operations cgroup_file_operations = {
};
static const struct inode_operations cgroup_dir_inode_operations = {
- .lookup = simple_lookup,
+ .lookup = cgroup_lookup,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
};
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+
/*
* Check if a file is a control file
*/
@@ -2155,10 +2214,6 @@ static inline struct cftype *__file_cft(struct file *file)
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
- static const struct dentry_operations cgroup_dops = {
- .d_iput = cgroup_diput,
- };
-
struct inode *inode;
if (!dentry)
@@ -2184,7 +2239,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
}
- dentry->d_op = &cgroup_dops;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
return 0;
@@ -3172,6 +3226,23 @@ fail:
return ret;
}
+static u64 cgroup_clone_children_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return clone_children(cgrp);
+}
+
+static int cgroup_clone_children_write(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ if (val)
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ else
+ clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ return 0;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
@@ -3202,6 +3273,11 @@ static struct cftype files[] = {
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
};
static struct cftype cft_release_agent = {
@@ -3331,6 +3407,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ if (clone_children(parent))
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
@@ -3345,6 +3424,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
+ if (clone_children(parent) && ss->post_clone)
+ ss->post_clone(ss, cgrp);
}
cgroup_lock_hierarchy(root);
@@ -3559,17 +3640,15 @@ again:
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
+ list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
- spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
dput(d);
@@ -3785,7 +3864,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
@@ -4136,20 +4215,8 @@ void cgroup_post_fork(struct task_struct *child)
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
- int i;
struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
+ int i;
/*
* Unlink from the css_set task list if necessary.
@@ -4159,7 +4226,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
+ list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
@@ -4167,7 +4234,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
+
+ if (run_callbacks && need_forkexit_callback) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit) {
+ struct cgroup *old_cgrp =
+ rcu_dereference_raw(cg->subsys[i])->cgroup;
+ struct cgroup *cgrp = task_cgroup(tsk, i);
+ ss->exit(ss, cgrp, old_cgrp, tsk);
+ }
+ }
+ }
task_unlock(tsk);
+
if (cg)
put_css_set_taskexit(cg);
}
@@ -4526,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
return ret;
}
-static void __free_css_id_cb(struct rcu_head *head)
-{
- struct css_id *id;
-
- id = container_of(head, struct css_id, rcu_head);
- kfree(id);
-}
-
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
@@ -4548,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
- call_rcu(&id->rcu_head, __free_css_id_cb);
+ kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
@@ -4719,6 +4795,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+ struct inode *inode;
+ struct cgroup_subsys_state *css;
+
+ inode = f->f_dentry->d_inode;
+ /* check in cgroup filesystem dir */
+ if (inode->i_op != &cgroup_dir_inode_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry);
+ css = cgrp->subsys[id];
+ return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)