diff options
-rw-r--r-- | include/linux/cgroup.h | 23 | ||||
-rw-r--r-- | kernel/cgroup.c | 165 |
2 files changed, 112 insertions, 76 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e345d8b9004..b7bd4beae29 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -20,6 +20,7 @@ #include <linux/workqueue.h> #include <linux/xattr.h> #include <linux/fs.h> +#include <linux/percpu-refcount.h> #ifdef CONFIG_CGROUPS @@ -72,13 +73,8 @@ struct cgroup_subsys_state { */ struct cgroup *cgroup; - /* - * State maintained by the cgroup system to allow subsystems - * to be "busy". Should be accessed via css_get(), - * css_tryget() and css_put(). - */ - - atomic_t refcnt; + /* reference count - access via css_[try]get() and css_put() */ + struct percpu_ref refcnt; unsigned long flags; /* ID for this css, if possible */ @@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css) { /* We don't need to reference count the root state */ if (!(css->flags & CSS_ROOT)) - atomic_inc(&css->refcnt); + percpu_ref_get(&css->refcnt); } -extern bool __css_tryget(struct cgroup_subsys_state *css); - /** * css_tryget - try to obtain a reference on the specified css * @css: target css @@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; - return __css_tryget(css); + return percpu_ref_tryget(&css->refcnt); } -extern void __css_put(struct cgroup_subsys_state *css); - /** * css_put - put a css reference * @css: target css @@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css); static inline void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_ROOT)) - __css_put(css); + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ @@ -231,9 +223,10 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* For RCU-protected deletion */ + /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; + atomic_t css_kill_cnt; /* List of events which userspace want to receive */ struct 
list_head event_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ebbfc043153..2e9da7bf25c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,9 +63,6 @@ #include <linux/atomic.h> -/* css deactivation bias, makes css->refcnt negative to deny new trygets */ -#define CSS_DEACT_BIAS INT_MIN - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -static int css_unbias_refcnt(int refcnt) -{ - return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; -} - -/* the current nr of refs, always >= 0 whether @css is deactivated or not */ -static int css_refcnt(struct cgroup_subsys_state *css) -{ - int v = atomic_read(&css->refcnt); - - return css_unbias_refcnt(v); -} - /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work) deactivate_super(sb); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + schedule_work(&css->dput_work); +} + static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); css->flags = 0; css->id = NULL; if (cgrp == dummytop) @@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free_all; + init_cgroup_css(css, ss, cgrp); + if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) @@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_subsys(root, ss) { - 
if (cgrp->subsys[ss->subsys_id]) + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + if (css) { + percpu_ref_cancel_init(&css->refcnt); ss->css_free(cgrp); + } } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ @@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +static void cgroup_css_killed(struct cgroup *cgrp) +{ + if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) + return; + + /* percpu ref's of all css's are killed, kick off the next step */ + INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); + schedule_work(&cgrp->destroy_work); +} + +static void css_ref_killed_fn(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + cgroup_css_killed(css->cgroup); +} + +/** + * cgroup_destroy_locked - the first stage of cgroup destruction + * @cgrp: cgroup to be destroyed + * + * css's make use of percpu refcnts whose killing latency shouldn't be + * exposed to userland and are RCU protected. Also, cgroup core needs to + * guarantee that css_tryget() won't succeed by the time ->css_offline() is + * invoked. To satisfy all the requirements, destruction is implemented in + * the following two steps. + * + * s1. Verify @cgrp can be destroyed and mark it dying. Remove all + * userland visible parts and start killing the percpu refcnts of + * css's. Set up so that the next stage will be kicked off once all + * the percpu refcnts are confirmed to be killed. + * + * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the + * rest of destruction. Once all cgroup references are gone, the + * cgroup is RCU-freed. + * + * This function implements s1. After this step, @cgrp is gone as far as + * the userland is concerned and a new cgroup with the same name may be + * created. 
As cgroup doesn't care about the names internally, this + * doesn't cause any problem. + */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { @@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by deactivating refcnt and mark @cgrp - * removed. This makes future css_tryget() attempts fail which we - * guarantee to ->css_offline() callbacks. + * Block new css_tryget() by killing css refcnts. cgroup core + * guarantees that, by the time ->css_offline() is invoked, no new + * css reference will be given out via css_tryget(). We can't + * simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen + * as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. The + * notification callback keeps track of the number of css's to be + * killed and schedules cgroup_offline_fn() to perform the rest of + * destruction once the percpu refs of all css's are confirmed to + * be killed. */ + atomic_set(&cgrp->css_kill_cnt, 1); for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); + /* + * Killing would put the base ref, but we need to keep it + * alive until after ->css_offline. + */ + percpu_ref_get(&css->refcnt); + + atomic_inc(&cgrp->css_kill_cnt); + percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); } + cgroup_css_killed(cgrp); /* * Mark @cgrp dead. 
This prevents further task migration and child @@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } spin_unlock(&cgrp->event_list_lock); - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); - return 0; }; +/** + * cgroup_offline_fn - the second step of cgroup destruction + * @work: cgroup->destroy_work + * + * This function is invoked from a work item for a cgroup which is being + * destroyed after the percpu refcnts of all css's are guaranteed to be + * seen as killed on all CPUs, and performs the rest of destruction. This + * is the second step of destruction described in the comment above + * cgroup_destroy_locked(). + */ static void cgroup_offline_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); - /* tell subsystems to initate destruction */ + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initiate destruction. + */ for_each_subsys(cgrp->root, ss) offline_css(ss, cgrp); /* - * Put all the base refs. Each css holds an extra reference to the - * cgroup's dentry and cgroup removal proceeds regardless of css - * refs. On the last put of each css, whenever that may be, the - * extra dentry ref is put so that dentry destruction happens only - * after all css's are released. + * Put the css refs from cgroup_destroy_locked(). Each css holds + * an extra reference to the cgroup's dentry and cgroup removal + * proceeds regardless of css refs. On the last put of each css, + * whenever that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released.
*/ for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); @@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp) } } -/* Caller must verify that the css is not for root cgroup */ -bool __css_tryget(struct cgroup_subsys_state *css) -{ - while (true) { - int t, v; - - v = css_refcnt(css); - t = atomic_cmpxchg(&css->refcnt, v, v + 1); - if (likely(t == v)) - return true; - else if (t < 0) - return false; - cpu_relax(); - } -} -EXPORT_SYMBOL_GPL(__css_tryget); - -/* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css) -{ - int v; - - v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - if (v == 0) - schedule_work(&css->dput_work); -} -EXPORT_SYMBOL_GPL(__css_put); - /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path @@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, css_refcnt(css)); + cssid = rcu_dereference_raw(css->id); if (cssid) return cssid->id; |