From 75139b8274c3e30354daea623f14b43a482a0bb5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:33 -0800 Subject: cgroups: remove some redundant NULL checks - In cgroup_clone(), if vfs_mkdir() returns successfully, dentry->d_fsdata will be the pointer to the newly created cgroup and won't be NULL. - a cgroup file's dentry->d_fsdata won't be NULL, guaranteed by cgroup_add_file(). - When walking through the subsystems of a cgroup_fs (using for_each_subsys), cgrp->subsys[ss->subsys_id] won't be NULL, guaranteed by cgroup_create(). (Also remove 2 unused variables in cgroup_rmdir(). Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f221446aa02..220e0fd659f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -586,7 +586,7 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + if (ss->pre_destroy) ss->pre_destroy(ss, cgrp); return; } @@ -610,10 +610,8 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) /* * Release the subsystem state objects. */ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } + for_each_subsys(cgrp->root, ss) + ss->destroy(ss, cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1445,7 +1443,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1490,7 +1488,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1554,10 +1552,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; + if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); @@ -2463,8 +2459,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ @@ -2487,8 +2481,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_lock(&cgroup_mutex); parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) @@ -2937,7 +2929,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + ret = vfs_mkdir(inode, dentry, 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { @@ -2947,13 +2939,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, goto out_release; } - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. */ -- cgit v1.2.3-70-g09d2 From b12b533fa523e94e0cc9dc23274ae4f9439f1313 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: add lock for child->cgroups in cgroup_post_fork() When cgroup_post_fork() is called, child is seen by find_task_by_vpid(), so child->cgroups maybe be changed, It'll incorrect. child->cgroups's refcnt is decreased child->cgroups's refcnt is increased but child->cg_list is added to child->cgroups's list. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 220e0fd659f..d7ab4ffd8fd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2792,8 +2792,10 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); + task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.3-70-g09d2 From 2019f634ce5904c19eba4e86f51b1a119a53a9f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: fix cgroup_iter_next() bug We access res->cgroups without the task_lock(), so res->cgroups may be changed. it's unreliable, and "if (l == &res->cgroups->tasks)" may be false forever. We don't need add any lock for fixing this bug. we just access to struct css_set by struct cg_cgroup_link, not by struct task_struct. Since we hold css_set_lock, struct cg_cgroup_link is reliable. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d7ab4ffd8fd..a391ab3bdfc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1808,6 +1808,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; + struct cg_cgroup_link *link; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) @@ -1815,7 +1816,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - if (l == &res->cgroups->tasks) { + link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); + if (l == &link->cg->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); -- cgit v1.2.3-70-g09d2 From b2aa30f7bb381e04c93eed106089ba55553955f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:37 -0800 Subject: cgroups: don't put struct cgroupfs_root protected by RCU We don't access struct cgroupfs_root in fast path, so we should not put struct cgroupfs_root protected by RCU But the comment in struct cgroup_subsys.root confuse us. struct cgroup_subsys.root is used in these places: 1 find_css_set(): if (ss->root->subsys_list.next == &ss->sibling) 2 rebind_subsystems(): if (ss->root != &rootnode) rcu_assign_pointer(ss->root, root); rcu_assign_pointer(subsys[i]->root, &rootnode); 3 cgroup_has_css_refs(): if (ss->root != cgrp->root) 4 cgroup_init_subsys(): ss->root = &rootnode; 5 proc_cgroupstats_show(): ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled); 6 cgroup_clone(): root = subsys->root; if ((root != subsys->root) || All these place we have held cgroup_lock() or we don't dereference to struct cgroupfs_root. It's means wo don't need RCU when use struct cgroup_subsys.root, and we should not put struct cgroupfs_root protected by RCU. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08b78c09b09..f68dfd8dd53 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,7 +337,6 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* Protected by RCU */ struct cgroupfs_root *root; struct list_head sibling; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a391ab3bdfc..a288da176e4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -713,7 +713,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -725,7 +725,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); + subsys[i]->root = &rootnode; list_del(&ss->sibling); } else if (bit & final_bits) { /* Subsystem state should already exist */ -- cgit v1.2.3-70-g09d2 From 104cbd55377029e70fc2cee01089e84b9c36e5dc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:38 -0800 Subject: cgroups: use task_lock() for access tsk->cgroups safe in cgroup_clone() Use task_lock() protect tsk->cgroups and get_css_set(tsk->cgroups). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a288da176e4..00d5136d38c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,6 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } + task_lock(tsk); cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); @@ -2915,6 +2916,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, /* Keep the cgroup alive */ get_css_set(cg); + task_unlock(tsk); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ -- cgit v1.2.3-70-g09d2 From 77efecd9e0526327548152df715ab8644ecb5ba0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:39 -0800 Subject: cgroups: call find_css_set() safely in cgroup_attach_task() In cgroup_attach_task(), tsk maybe exit when we call find_css_set(). and find_css_set() will access to invalid css_set. This patch increases the count before get_css_set(), and decreases it after find_css_set(). NOTE: css_set's refcount is also taskcount, after this patch applied, taskcount may be off-by-one WHEN cgroup_lock() is not held. but I reviewed other code which use taskcount, they are still correct. No regression found by reviewing and simply testing. So I do not use two counters in css_set. (one counter for taskcount, the other for refcount. like struct mm_struct) If this fix cause regression, we will use two counters in css_set. Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00d5136d38c..61e92c5867e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1214,7 +1214,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; + struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; @@ -1234,11 +1234,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); + put_css_set(cg); if (!newcg) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 7534432dcc3c654a8671b6b0cdffd1dbdbc73074 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:40 -0800 Subject: cgroups: remove rcu_read_lock() in cgroupstats_build() cgroup_iter_* do not need rcu_read_lock(). In cgroup_enable_task_cg_lists(), do_each_thread() and while_each_thread() are protected by RCU, it's OK, for write_lock(&css_set_lock) implies rcu_read_lock() in non-RT kernel. If we need explicit rcu_read_lock(), we should add rcu_read_lock() in cgroup_enable_task_cg_lists(), not cgroup_iter_*. Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 61e92c5867e..f55af3daffc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2055,7 +2055,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { @@ -2080,7 +2079,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } cgroup_iter_end(cgrp, &it); - rcu_read_unlock(); err: return ret; } -- cgit v1.2.3-70-g09d2 From e5f6a8609bab0c2d7543ab1505105e011832afd7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:41 -0800 Subject: cgroups: make root_list contains active hierarchies only Don't link rootnode to the root list, so root_list contains active hierarchies only as the comment indicates. And rename for_each_root() to for_each_active_root(). Also remove redundant check in cgroup_kill_sb(). Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f55af3daffc..fd572d05769 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -84,7 +84,7 @@ struct cgroupfs_root { /* Tracks how many cgroups are currently defined in hierarchy.*/ int number_of_cgroups; - /* A list running through the mounted hierarchies */ + /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ @@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_subsys(_root, _ss) \ list_for_each_entry(_ss, &_root->subsys_list, sibling) -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by @@ -1111,10 +1111,9 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } + list_del(&root->root_list); + root_count--; + mutex_unlock(&cgroup_mutex); kfree(root); @@ -2559,7 +2558,6 @@ int __init cgroup_init_early(void) INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; @@ -2666,15 +2664,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); - for_each_root(root) { + for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); -- cgit v1.2.3-70-g09d2 From 33a68ac1c1b695216e873ee12e819adbe73e4d9f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: add inactive subsystems to rootnode.subsys_list Though for an inactive hierarchy, we have subsys->root == &rootnode, but rootnode's subsys_list is always empty. This conflicts with the code in find_css_set(): for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ... if (ss->root->subsys_list.next == &ss->sibling) { ... } } if (list_empty(&rootnode.subsys_list)) { ... } The above code assumes rootnode.subsys_list links all inactive hierarchies. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd572d05769..abf7248f501 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -712,7 +712,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(dummytop->subsys[i]->cgroup != dummytop); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -726,7 +726,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; - list_del(&ss->sibling); + list_move(&ss->sibling, &rootnode.subsys_list); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2521,6 +2521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ -- cgit v1.2.3-70-g09d2 From c12f65d4396e05c51ce3af7f159ead98574a587c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: introduce link_css_set() to remove duplicate code Add a common function link_css_set() to link a css_set to a cgroup. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 68 +++++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abf7248f501..4c475ce4e22 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @cg: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_cg_links, + struct css_set *cg, struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + BUG_ON(list_empty(tmp_cg_links)); + link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, + cgrp_link_list); + link->cg = cg; + list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_add(&link->cg_link_list, &cg->cg_links); +} + /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -399,7 +418,6 @@ static struct css_set *find_css_set( int i; struct list_head tmp_cg_links; - struct cg_cgroup_link *link; struct hlist_head *hhead; @@ -444,26 +462,11 @@ static struct css_set *find_css_set( * only do it for the first subsystem in each * hierarchy */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -988,7 +991,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, root = NULL; } else { /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; int i; @@ -1029,7 +1032,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, list_add(&root->root_list, &roots); root_count++; - sb->s_root->d_fsdata = &root->top_cgroup; + sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all @@ -1040,29 +1043,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct hlist_node *node; struct css_set *cg; - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&cgrp->sibling)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cgroup_populate_dir(cgrp); + cgroup_populate_dir(root_cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } -- cgit v1.2.3-70-g09d2 From e7b80bb695a5b64c92e314838e083b2f3bdf29b2 Mon Sep 17 00:00:00 2001 From: Gowrishankar M Date: Wed, 7 Jan 2009 18:07:43 -0800 Subject: cgroups: skip processes from other namespaces when listing a cgroup Once tasks are populated from system namespace inside cgroup, container replaces other namespace task with 0 while listing tasks, inside container. Though this is expected behaviour from container end, there is no use of showing unwanted 0s. In this patch, we check if a process is in same namespace before loading into pid array. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Gowrishankar M Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c475ce4e22..cb7c72b91f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2007,14 +2007,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - int n = 0; + int n = 0, pid; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_vnr(tsk); + pid = task_pid_vnr(tsk); + if (pid > 0) + pidarray[n++] = pid; } cgroup_iter_end(cgrp, &it); return n; -- cgit v1.2.3-70-g09d2 From a47295e6bc42ad35f9c15ac66f598aa24debd4e2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:07:44 -0800 Subject: cgroups: make cgroup_path() RCU-safe Fix races between /proc/sched_debug by freeing cgroup objects via an RCU callback. Thus any cgroup reference obtained from an RCU-safe source will remain valid during the RCU section. Since dentries are also RCU-safe, this allows us to traverse up the tree safely. Additionally, make cgroup_path() check for a NULL cgrp->dentry to avoid trying to report a path for a partially-created cgroup. [lizf@cn.fujitsu.com: call deactive_super() in cgroup_diput()] Signed-off-by: Paul Menage Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 ++++- kernel/cgroup.c | 30 +++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f68dfd8dd53..73d1c730c3c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -116,7 +116,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry */ + struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -145,6 +145,9 @@ struct cgroup { int pids_use_count; /* Length of the current tasks_pids array */ int pids_length; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; }; /* A css_set is a structure holding pointers to a set of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb7c72b91f4..83ea4f524be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } +static void free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -1134,14 +1143,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1154,13 +1165,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) @@ -1663,7 +1675,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); - cgrp->dentry = dentry; + rcu_assign_pointer(cgrp->dentry, dentry); dget(dentry); } dput(dentry); -- cgit v1.2.3-70-g09d2 From 999cd8a450f8f93701669a61cac4d3b19eca07e8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:36 -0800 Subject: cgroups: add a per-subsystem hierarchy_mutex These patches introduce new locking/refcount support for cgroups to reduce the need for subsystems to call cgroup_lock(). This will ultimately allow the atomicity of cgroup_rmdir() (which was removed recently) to be restored. These three patches give: 1/3 - introduce a per-subsystem hierarchy_mutex which a subsystem can use to prevent changes to its own cgroup tree 2/3 - use hierarchy_mutex in place of calling cgroup_lock() in the memory controller 3/3 - introduce a css_tryget() function similar to the one recently proposed by Kamezawa, but avoiding spurious refcount failures in the event of a race between a css_tryget() and an unsuccessful cgroup_rmdir() Future patches will likely involve: - using hierarchy mutex in place of cgroup_lock() in more subsystems where appropriate - restoring the atomicity of cgroup_rmdir() with respect to cgroup_create() This patch: Add a hierarchy_mutex to the cgroup_subsys object that protects changes to the hierarchy observed by that subsystem. It is taken by the cgroup subsystem (in addition to cgroup_mutex) for the following operations: - linking a cgroup into that subsystem's cgroup tree - unlinking a cgroup from that subsystem's cgroup tree - moving the subsystem to/from a hierarchy (including across the bind() callback) Thus if the subsystem holds its own hierarchy_mutex, it can safely traverse its own hierarchy. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 2 +- include/linux/cgroup.h | 17 ++++++++++++++++- kernel/cgroup.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 60287e9e9d2..e33ee74eee7 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -528,7 +528,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set up. void bind(struct cgroup_subsys *ss, struct cgroup *root) -(cgroup_mutex held by caller) +(cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy and root cgroup. Currently this will only involve movement between diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 73d1c730c3c..ce1c1f34c30 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -340,8 +340,23 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - struct cgroupfs_root *root; + /* + * Protects sibling/children links of cgroups in this + * hierarchy, plus protects which hierarchy (or none) the + * subsystem is a part of (i.e. root/sibling). To avoid + * potential deadlocks, the following operations should not be + * undertaken while holding any hierarchy_mutex: + * + * - allocating memory + * - initiating hotplug events + */ + struct mutex hierarchy_mutex; + /* + * Link to parent, and list entry in parent's children. + * Protected by this->hierarchy_mutex and cgroup_lock() + */ + struct cgroupfs_root *root; struct list_head sibling; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83ea4f524be..8b6379cdf63 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -722,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2338,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2386,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -2504,8 +2532,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2547,6 +2579,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } -- cgit v1.2.3-70-g09d2 From e7c5ec9193d32b9559a3bb8893ceedbda85201ff Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:38 -0800 Subject: cgroups: add css_tryget() Add css_tryget(), that obtains a counted reference on a CSS. It is used in situations where the caller has a "weak" reference to the CSS, i.e. one that does not protect the cgroup from removal via a reference count, but would instead be cleaned up by a destroy() callback. css_tryget() will return true on success, or false if the cgroup is being removed. This is similar to Kamezawa Hiroyuki's patch from a week or two ago, but with the difference that in the event of css_tryget() racing with a cgroup_rmdir(), css_tryget() will only return false if the cgroup really does get removed. This implementation is done by biasing css->refcnt, so that a refcnt of 1 means "releasable" and 0 means "released or releasing". In the event of a race, css_tryget() distinguishes between "released" and "releasing" by checking for the CSS_REMOVED flag in css->flags. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 38 ++++++++++++++++++++++++++----- kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ce1c1f34c30..e267e62827b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -52,9 +52,9 @@ struct cgroup_subsys_state { * hierarchy structure */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow - * subsystems to be "busy". Should be accessed via css_get() - * and css_put() */ + /* State maintained by the cgroup system to allow subsystems + * to be "busy". Should be accessed via css_get(), + * css_tryget() and and css_put(). */ atomic_t refcnt; @@ -64,11 +64,14 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { CSS_ROOT, /* This CSS is the root of the subsystem */ + CSS_REMOVED, /* This CSS is dead */ }; /* - * Call css_get() to hold a reference on the cgroup; - * + * Call css_get() to hold a reference on the css; it can be used + * for a reference obtained via: + * - an existing ref-counted reference to the css + * - task->cgroups for a locked task */ static inline void css_get(struct cgroup_subsys_state *css) @@ -77,9 +80,32 @@ static inline void css_get(struct cgroup_subsys_state *css) if (!test_bit(CSS_ROOT, &css->flags)) atomic_inc(&css->refcnt); } + +static inline bool css_is_removed(struct cgroup_subsys_state *css) +{ + return test_bit(CSS_REMOVED, &css->flags); +} + +/* + * Call css_tryget() to take a reference on a css if your existing + * (known-valid) reference isn't already ref-counted. Returns false if + * the css has been destroyed. + */ + +static inline bool css_tryget(struct cgroup_subsys_state *css) +{ + if (test_bit(CSS_ROOT, &css->flags)) + return true; + while (!atomic_inc_not_zero(&css->refcnt)) { + if (test_bit(CSS_REMOVED, &css->flags)) + return false; + } + return true; +} + /* * css_put() should be called to release a reference taken by - * css_get() + * css_get() or css_tryget() */ extern void __css_put(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b6379cdf63..c29831076e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2333,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2465,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2486,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -2522,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -3078,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } -- cgit v1.2.3-70-g09d2