From ed95779340b50e362245c81b5dec0d11a1debfa8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill cgroup_subsys->__DEPRECATED_clear_css_refs 2ef37d3fe4 ("memcg: Simplify mem_cgroup_force_empty_list error handling") removed the last user of __DEPRECATED_clear_css_refs. This patch removes __DEPRECATED_clear_css_refs and mechanisms to support it. * Conditionals dependent on __DEPRECATED_clear_css_refs removed. * cgroup_clear_css_refs() can no longer fail. All that needs to be done are deactivating refcnts, setting CSS_REMOVED and putting the base reference on each css. Remove cgroup_clear_css_refs() and the failure path, and open-code the loops into cgroup_rmdir(). This patch keeps the two for_each_subsys() loops separate while open coding them. They can be merged now but there are scheduled changes which need them to be separate, so keep them separate to reduce the amount of churn. local_irq_save/restore() from cgroup_clear_css_refs() are replaced with local_irq_disable/enable() for simplicity. This is safe as cgroup_rmdir() is always called with IRQ enabled. Note that this IRQ switching is necessary to ensure that css_tryget() isn't called from IRQ context on the same CPU while lower context is between CSS deactivation and setting CSS_REMOVED as css_tryget() would hang forever in such cases waiting for CSS to be re-activated or CSS_REMOVED set. This will go away soon. v2: cgroup_call_pre_destroy() removal dropped per Michal. Commit message updated to explain local_irq_disable/enable() conversion. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- include/linux/cgroup.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c90eaa80344..02e09c0e98a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -86,7 +86,6 @@ struct cgroup_subsys_state { enum { CSS_ROOT, /* This CSS is the root of the subsystem */ CSS_REMOVED, /* This CSS is dead */ - CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */ }; /* Caller must verify that the css is not for root cgroup */ @@ -485,17 +484,6 @@ struct cgroup_subsys { */ bool use_id; - /* - * If %true, cgroup removal will try to clear css refs by retrying - * ss->pre_destroy() until there's no css ref left. This behavior - * is strictly for backward compatibility and will be removed as - * soon as the current user (memcg) is updated. - * - * If %false, ss->pre_destroy() can't fail and cgroup removal won't - * wait for css refs to drop to zero before proceeding. - */ - bool __DEPRECATED_clear_css_refs; - #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; -- cgit v1.2.3-70-g09d2 From e93160803ffda2e67d9ff9cacb63bb6868c8398f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill CSS_REMOVED CSS_REMOVED is one of the several contortions which were necessary to support css reference draining on cgroup removal. All css->refcnts which need draining should be deactivated and verified to equal zero atomically w.r.t. css_tryget(). If any one isn't zero, all refcnts needed to be re-activated and css_tryget() shouldn't fail in the process. This was achieved by letting css_tryget() busy-loop until either the refcnt is reactivated (failed removal attempt) or CSS_REMOVED is set (committing to removal). Now that css refcnt draining is no longer used, there's no need for atomic rollback mechanism. css_tryget() simply can look at the reference count and fail if it's deactivated - it's never getting re-activated. This patch removes CSS_REMOVED and updates __css_tryget() to fail if the refcnt is deactivated. As deactivation and removal are a single step now, they no longer need to be protected against css_tryget() happening from irq context. Remove local_irq_disable/enable() from cgroup_rmdir(). Note that this removes css_is_removed() whose only user is VM_BUG_ON() in memcontrol.c. We can replace it with a check on the refcnt but given that the only use case is a debug assert, I think it's better to simply unexport it. v2: Comment updated and explanation on local_irq_disable/enable() added per Michal Hocko. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 6 ------ kernel/cgroup.c | 31 ++++++++++++------------------- mm/memcontrol.c | 7 +++---- 3 files changed, 15 insertions(+), 29 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 02e09c0e98a..a3098046250 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -85,7 +85,6 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { CSS_ROOT, /* This CSS is the root of the subsystem */ - CSS_REMOVED, /* This CSS is dead */ }; /* Caller must verify that the css is not for root cgroup */ @@ -108,11 +107,6 @@ static inline void css_get(struct cgroup_subsys_state *css) __css_get(css, 1); } -static inline bool css_is_removed(struct cgroup_subsys_state *css) -{ - return test_bit(CSS_REMOVED, &css->flags); -} - /* * Call css_tryget() to take a reference on a css if your existing * (known-valid) reference isn't already ref-counted. Returns false if diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c605e2bdd1..c194f9e4fc7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -170,8 +170,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -4112,8 +4112,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - local_irq_disable(); - /* block new css_tryget() by deactivating refcnt */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -4123,21 +4121,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } /* - * Set REMOVED. All in-progress css_tryget() will be released. * Put all the base refs. Each css holds an extra reference to the * cgroup's dentry and cgroup removal proceeds regardless of css * refs. On the last put of each css, whenever that may be, the * extra dentry ref is put so that dentry destruction happens only * after all css's are released. */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } - - local_irq_enable(); + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); @@ -4861,15 +4852,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a1d584ffed..37c35664654 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2343,7 +2343,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - VM_BUG_ON(css_is_removed(&memcg->css)); if (mem_cgroup_is_root(memcg)) goto done; if (nr_pages == 1 && consume_stock(memcg)) @@ -2483,9 +2482,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller must check css_is_removed() or some if - * it's concern. (dropping refcnt from swap can be called against removed - * memcg.) + * rcu_read_lock(). The caller is responsible for calling css_tryget if + * the mem_cgroup is used for charging. (dropping refcnt from swap can be + * called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { -- cgit v1.2.3-70-g09d2 From b25ed609d0eecf077db607e88ea70bae83b6adb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh --- include/linux/cgroup.h | 21 --------------------- kernel/cgroup.c | 51 -------------------------------------------------- mm/memcontrol.c | 24 +----------------------- 3 files changed, 1 insertion(+), 95 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a3098046250..47868a86ba2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -144,10 +144,6 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, - /* - * A thread in rmdir() is wating for this cgroup. - */ - CGRP_WAIT_ON_RMDIR, /* * Clone cgroup values when creating a new child cgroup */ @@ -411,23 +407,6 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if cgrp is a descendant of the task's cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); -/* - * When the subsys has to access css and may add permanent refcnt to css, - * it should take care of racy conditions with rmdir(). Following set of - * functions, is for stop/restart rmdir if necessary. - * Because these will call css_get/put, "css" should be alive css. - * - * cgroup_exclude_rmdir(); - * ...do some jobs which may access arbitrary empty cgroup - * cgroup_release_and_wakeup_rmdir(); - * - * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, - * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. - */ - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66204a6f68f..c5f6fb28dd0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,33 +965,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - /* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function @@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); /* * Block new css_tryget() by deactivating refcnt and mark @cgrp @@ -4114,9 +4067,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); @@ -4864,7 +4814,6 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: schedule_work(&css->dput_work); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 37c35664654..930edfaa518 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2681,13 +2681,6 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, anon, nr_pages); - /* - * We charges against "to" which may not have any tasks. Then, "to" - * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and move charge, so it's - * guaranteed that "to" is never removed. So, we don't check rmdir - * status here. - */ move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -2893,7 +2886,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, return; if (!memcg) return; - cgroup_exclude_rmdir(&memcg->css); __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* @@ -2907,12 +2899,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, swp_entry_t ent = {.val = page_private(page)}; mem_cgroup_uncharge_swap(ent); } - /* - * At swapin, we may charge account against cgroup which has no tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } void mem_cgroup_commit_charge_swapin(struct page *page, @@ -3360,8 +3346,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, if (!memcg) return; - /* blocks rmdir() */ - cgroup_exclude_rmdir(&memcg->css); + if (!migration_ok) { used = oldpage; unused = newpage; @@ -3395,13 +3380,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, */ if (anon) mem_cgroup_uncharge_page(used); - /* - * At migration, we may charge account against cgroup which has no - * tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } /* -- cgit v1.2.3-70-g09d2 From bcf6de1b9129531215d26dd9af8331e84973bc52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: make ->pre_destroy() return void All ->pre_destory() implementations return 0 now, which is the only allowed return value. Make it return void. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh Cc: Vivek Goyal --- block/blk-cgroup.c | 3 +-- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- mm/hugetlb_cgroup.c | 4 +--- mm/memcontrol.c | 3 +-- 5 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f3b44a65fc7..a7816f3d005 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -600,7 +600,7 @@ struct cftype blkcg_files[] = { * * This is the blkcg counterpart of ioc_release_fn(). */ -static int blkcg_pre_destroy(struct cgroup *cgroup) +static void blkcg_pre_destroy(struct cgroup *cgroup) { struct blkcg *blkcg = cgroup_to_blkcg(cgroup); @@ -622,7 +622,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup) } spin_unlock_irq(&blkcg->lock); - return 0; } static void blkcg_destroy(struct cgroup *cgroup) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 47868a86ba2..adb2adc8ec1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -436,7 +436,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); - int (*pre_destroy)(struct cgroup *cgrp); + void (*pre_destroy)(struct cgroup *cgrp); void (*destroy)(struct cgroup *cgrp); int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c5f6fb28dd0..83cd7d041c6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4054,7 +4054,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&cgroup_mutex); for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) - WARN_ON_ONCE(ss->pre_destroy(cgrp)); + ss->pre_destroy(cgrp); mutex_lock(&cgroup_mutex); /* diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index dc595c6b1f5..0d3a1a31773 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -155,7 +155,7 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) +static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) { struct hstate *h; struct page *page; @@ -172,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) } cond_resched(); } while (hugetlb_cgroup_have_usage(cgroup)); - - return 0; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6678f991c6c..a1811ce60e2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5002,12 +5002,11 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup *cont) +static void mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); mem_cgroup_reparent_charges(memcg); - return 0; } static void mem_cgroup_destroy(struct cgroup *cont) -- cgit v1.2.3-70-g09d2