summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/cgroup.h 23
-rw-r--r-- kernel/cgroup.c 165
2 files changed, 112 insertions, 76 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b9004..b7bd4beae29 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
#include <linux/workqueue.h>
#include <linux/xattr.h>
#include <linux/fs.h>
+#include <linux/percpu-refcount.h>
#ifdef CONFIG_CGROUPS
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
*/
struct cgroup *cgroup;
- /*
- * State maintained by the cgroup system to allow subsystems
- * to be "busy". Should be accessed via css_get(),
- * css_tryget() and css_put().
- */
-
- atomic_t refcnt;
+ /* reference count - access via css_[try]get() and css_put() */
+ struct percpu_ref refcnt;
unsigned long flags;
/* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
{
/* We don't need to reference count the root state */
if (!(css->flags & CSS_ROOT))
- atomic_inc(&css->refcnt);
+ percpu_ref_get(&css->refcnt);
}
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
/**
* css_tryget - try to obtain a reference on the specified css
* @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
{
if (css->flags & CSS_ROOT)
return true;
- return __css_tryget(css);
+ return percpu_ref_tryget(&css->refcnt);
}
-extern void __css_put(struct cgroup_subsys_state *css);
-
/**
* css_put - put a css reference
* @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_ROOT))
- __css_put(css);
+ percpu_ref_put(&css->refcnt);
}
/* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
struct list_head pidlists;
struct mutex pidlist_mutex;
- /* For RCU-protected deletion */
+ /* For css percpu_ref killing and RCU-protected deletion */
struct rcu_head rcu_head;
struct work_struct destroy_work;
+ atomic_t css_kill_cnt;
/* List of events which userspace want to receive */
struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ebbfc043153..2e9da7bf25c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
#include <linux/atomic.h>
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS INT_MIN
-
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
struct cftype cfts[], bool is_add);
-static int css_unbias_refcnt(int refcnt)
-{
- return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
- int v = atomic_read(&css->refcnt);
-
- return css_unbias_refcnt(v);
-}
-
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
deactivate_super(sb);
}
+static void css_release(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ schedule_work(&css->dput_work);
+}
+
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err = PTR_ERR(css);
goto err_free_all;
}
+
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free_all;
+
init_cgroup_css(css, ss, cgrp);
+
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err_free_all:
for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ if (css) {
+ percpu_ref_cancel_init(&css->refcnt);
ss->css_free(cgrp);
+ }
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+ if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+ return;
+
+ /* percpu refs of all css's are killed, kick off the next step */
+ INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+ schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked. To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return -EBUSY;
/*
- * Block new css_tryget() by deactivating refcnt and mark @cgrp
- * removed. This makes future css_tryget() attempts fail which we
- * guarantee to ->css_offline() callbacks.
+ * Block new css_tryget() by killing css refcnts. cgroup core
+ * guarantees that, by the time ->css_offline() is invoked, no new
+ * css reference will be given out via css_tryget(). We can't
+ * simply call percpu_ref_kill() and proceed to offlining css's
+ * because percpu_ref_kill() doesn't guarantee that the ref is seen
+ * as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs. The
+ * notification callback keeps track of the number of css's to be
+ * killed and schedules cgroup_offline_fn() to perform the rest of
+ * destruction once the percpu refs of all css's are confirmed to
+ * be killed.
*/
+ atomic_set(&cgrp->css_kill_cnt, 1);
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- WARN_ON(atomic_read(&css->refcnt) < 0);
- atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+ /*
+ * Killing would put the base ref, but we need to keep it
+ * alive until after ->css_offline.
+ */
+ percpu_ref_get(&css->refcnt);
+
+ atomic_inc(&cgrp->css_kill_cnt);
+ percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
}
+ cgroup_css_killed(cgrp);
/*
* Mark @cgrp dead. This prevents further task migration and child
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
}
spin_unlock(&cgrp->event_list_lock);
- INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
- schedule_work(&cgrp->destroy_work);
-
return 0;
};
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction. This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
static void cgroup_offline_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
mutex_lock(&cgroup_mutex);
- /* tell subsystems to initate destruction */
+ /*
+ * css_tryget() is guaranteed to fail now. Tell subsystems to
+ * initiate destruction.
+ */
for_each_subsys(cgrp->root, ss)
offline_css(ss, cgrp);
/*
- * Put all the base refs. Each css holds an extra reference to the
- * cgroup's dentry and cgroup removal proceeds regardless of css
- * refs. On the last put of each css, whenever that may be, the
- * extra dentry ref is put so that dentry destruction happens only
- * after all css's are released.
+ * Put the css refs from cgroup_destroy_locked(). Each css holds
+ * an extra reference to the cgroup's dentry and cgroup removal
+ * proceeds regardless of css refs. On the last put of each css,
+ * whenever that may be, the extra dentry ref is put so that dentry
+ * destruction happens only after all css's are released.
*/
for_each_subsys(cgrp->root, ss)
css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
}
}
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
- while (true) {
- int t, v;
-
- v = css_refcnt(css);
- t = atomic_cmpxchg(&css->refcnt, v, v + 1);
- if (likely(t == v))
- return true;
- else if (t < 0)
- return false;
- cpu_relax();
- }
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
- int v;
-
- v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
- if (v == 0)
- schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
* on this or this is under rcu_read_lock(). Once css->id is allocated,
* it's unchanged until freed.
*/
- cssid = rcu_dereference_check(css->id, css_refcnt(css));
+ cssid = rcu_dereference_raw(css->id);
if (cssid)
return cssid->id;