summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-12 08:18:24 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-12 08:18:24 -0800
commitd206e09036d6201f90b2719484c8a59526c46125 (patch)
tree84b9057919bcb8cfd1cff47baa5fc74457e77d6d /mm/memcontrol.c
parentfef3ff2eb777e76cfa5ae67591982d902c17139c (diff)
parent15ef4ffaa797034d5ff82844daf8f595d7c6d53c (diff)
Merge branch 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup changes from Tejun Heo: "A lot of activities on cgroup side. The big changes are focused on making cgroup hierarchy handling saner. - cgroup_rmdir() had peculiar semantics - it allowed cgroup destruction to be vetoed by individual controllers and tried to drain refcnt synchronously. The vetoing never worked properly and caused good deal of contortions in cgroup. memcg was the last reamining user. Michal Hocko removed the usage and cgroup_rmdir() path has been simplified significantly. This was done in a separate branch so that the memcg people can base further memcg changes on top. - The above allowed cleaning up cgroup lifecycle management and implementation of generic cgroup iterators which are used to improve hierarchy support. - cgroup_freezer updated to allow migration in and out of a frozen cgroup and handle hierarchy. If a cgroup is frozen, all descendant cgroups are frozen. - netcls_cgroup and netprio_cgroup updated to handle hierarchy properly. - Various fixes and cleanups. - Two merge commits. One to pull in memcg and rmdir cleanups (needed to build iterators). The other pulled in cgroup/for-3.7-fixes for device_cgroup fixes so that further device_cgroup patches can be stacked on top." Fixed up a trivial conflict in mm/memcontrol.c as per Tejun (due to commit bea8c150a7 ("memcg: fix hotplugged memory zone oops") in master touching code close to commit 2ef37d3fe4 ("memcg: Simplify mem_cgroup_force_empty_list error handling") in for-3.8) * 'for-3.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (65 commits) cgroup: update Documentation/cgroups/00-INDEX cgroup_rm_file: don't delete the uncreated files cgroup: remove subsystem files when remounting cgroup cgroup: use cgroup_addrm_files() in cgroup_clear_directory() cgroup: warn about broken hierarchies only after css_online cgroup: list_del_init() on removed events cgroup: fix lockdep warning for event_control cgroup: move list add after list head initilization netprio_cgroup: allow nesting and inherit config on cgroup creation netprio_cgroup: implement netprio[_set]_prio() helpers netprio_cgroup: use cgroup->id instead of cgroup_netprio_state->prioidx netprio_cgroup: reimplement priomap expansion netprio_cgroup: shorten variable names in extend_netdev_table() netprio_cgroup: simplify write_priomap() netcls_cgroup: move config inheritance to ->css_online() and remove .broken_hierarchy marking cgroup: remove obsolete guarantee from cgroup_task_migrate. cgroup: add cgroup->id cgroup, cpuset: remove cgroup_subsys->post_clone() cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/ cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free() ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c191
1 files changed, 96 insertions, 95 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf6d0df4849..12307b3838f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2370,7 +2370,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
- VM_BUG_ON(css_is_removed(&memcg->css));
if (mem_cgroup_is_root(memcg))
goto done;
if (nr_pages == 1 && consume_stock(memcg))
@@ -2510,9 +2509,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
@@ -2709,13 +2708,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, anon, nr_pages);
- /*
- * We charges against "to" which may not have any tasks. Then, "to"
- * can be under rmdir(). But in current implementation, caller of
- * this function is just force_empty() and move charge, so it's
- * guaranteed that "to" is never removed. So, we don't check rmdir
- * status here.
- */
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -2729,10 +2721,27 @@ out:
return ret;
}
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finaly mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
*/
-
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
@@ -2742,9 +2751,7 @@ static int mem_cgroup_move_parent(struct page *page,
unsigned long uninitialized_var(flags);
int ret;
- /* Is ROOT ? */
- if (mem_cgroup_is_root(child))
- return -EINVAL;
+ VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
@@ -2761,8 +2768,10 @@ static int mem_cgroup_move_parent(struct page *page,
if (!parent)
parent = root_mem_cgroup;
- if (nr_pages > 1)
+ if (nr_pages > 1) {
+ VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
+ }
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
@@ -2904,7 +2913,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
return;
if (!memcg)
return;
- cgroup_exclude_rmdir(&memcg->css);
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
@@ -2918,12 +2926,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
}
- /*
- * At swapin, we may charge account against cgroup which has no tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3371,8 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
if (!memcg)
return;
- /* blocks rmdir() */
- cgroup_exclude_rmdir(&memcg->css);
+
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3406,13 +3407,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
*/
if (anon)
mem_cgroup_uncharge_page(used);
- /*
- * At migration, we may charge account against cgroup which has no
- * tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
/*
@@ -3712,17 +3706,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return nr_reclaimed;
}
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to to clear
+ *
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages page themselves - pages are moved to the parent (or root)
+ * group.
*/
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
struct lruvec *lruvec;
- unsigned long flags, loop;
+ unsigned long flags;
struct list_head *list;
struct page *busy;
struct zone *zone;
@@ -3731,11 +3730,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
list = &lruvec->lists[lru];
- loop = mem_cgroup_get_lru_size(lruvec, lru);
- /* give some margin against EBUSY etc...*/
- loop += 256;
busy = NULL;
- while (loop--) {
+ do {
struct page_cgroup *pc;
struct page *page;
@@ -3761,76 +3757,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
cond_resched();
} else
busy = NULL;
- }
- return !list_empty(list);
+ } while (!list_empty(list));
}
/*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge to be 0 if there is no task by moving
+ * all the charges and pages to the parent.
* This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
- int ret;
- int node, zid, shrink;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = memcg->css.cgroup;
-
- css_get(&memcg->css);
+ int node, zid;
- shrink = 0;
- /* should free all ? */
- if (free_all)
- goto try_to_free;
-move_account:
do {
- ret = -EBUSY;
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
- goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
- ret = 0;
mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
- for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
enum lru_list lru;
for_each_lru(lru) {
- ret = mem_cgroup_force_empty_list(memcg,
+ mem_cgroup_force_empty_list(memcg,
node, zid, lru);
- if (ret)
- break;
}
}
- if (ret)
- break;
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
cond_resched();
- /* "ret" should also be checked to ensure all lists are empty. */
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
- css_put(&memcg->css);
- return ret;
-try_to_free:
+ /*
+ * This is a safety check because mem_cgroup_force_empty_list
+ * could have raced with mem_cgroup_replace_page_cache callers
+ * so the lru seemed empty but the page could have been added
+ * right after the check. RES_USAGE should be safe as we always
+ * charge before adding to the LRU.
+ */
+ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct cgroup *cgrp = memcg->css.cgroup;
+
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
- ret = -EBUSY;
- goto out;
- }
+ if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ return -EBUSY;
+
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
- shrink = 1;
while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending(current))
+ return -EINTR;
+
progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
@@ -3841,13 +3833,23 @@ try_to_free:
}
lru_add_drain();
- /* try move_account...there may be some *locked* pages. */
- goto move_account;
+ mem_cgroup_reparent_charges(memcg);
+
+ return 0;
}
static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ int ret;
+
+ if (mem_cgroup_is_root(memcg))
+ return -EINVAL;
+ css_get(&memcg->css);
+ ret = mem_cgroup_force_empty(memcg);
+ css_put(&memcg->css);
+
+ return ret;
}
@@ -4953,7 +4955,7 @@ err_cleanup:
}
static struct cgroup_subsys_state * __ref
-mem_cgroup_create(struct cgroup *cont)
+mem_cgroup_css_alloc(struct cgroup *cont)
{
struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
@@ -5034,14 +5036,14 @@ free_out:
return ERR_PTR(error);
}
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(memcg, false);
+ mem_cgroup_reparent_charges(memcg);
}
-static void mem_cgroup_destroy(struct cgroup *cont)
+static void mem_cgroup_css_free(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -5631,16 +5633,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
- .create = mem_cgroup_create,
- .pre_destroy = mem_cgroup_pre_destroy,
- .destroy = mem_cgroup_destroy,
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
.use_id = 1,
- .__DEPRECATED_clear_css_refs = true,
};
#ifdef CONFIG_MEMCG_SWAP