From de57780dc659f95b17ccb649f003278dde0b5b86 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Sep 2013 15:13:26 -0700 Subject: memcg: enhance memcg iterator to support predicates The caller of the iterator might know that some nodes or even subtrees should be skipped but there is no way to tell iterators about that so the only choice left is to let iterators to visit each node and do the selection outside of the iterating code. This, however, doesn't scale well with hierarchies with many groups where only few groups are interesting. This patch adds mem_cgroup_iter_cond variant of the iterator with a callback which gets called for every visited node. There are three possible ways how the callback can influence the walk. Either the node is visited, it is skipped but the tree walk continues down the tree or the whole subtree of the current group is skipped. [hughd@google.com: fix memcg-less page reclaim] Signed-off-by: Michal Hocko Cc: Balbir Singh Cc: Glauber Costa Cc: Greg Thelen Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Tejun Heo Cc: Ying Han Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 70 ++++++++++++++++++++++++++++++++++++++++++++------------- mm/vmscan.c | 16 +++++-------- 2 files changed, 60 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c016e001c5b..a4bb857d902 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -875,6 +875,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static enum mem_cgroup_filter_t +mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, + mem_cgroup_iter_filter cond) +{ + if (!cond) + return VISIT; + return cond(memcg, root); +} + /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -882,7 +891,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) + struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) { struct cgroup_subsys_state *prev_css, *next_css; @@ -900,11 +909,31 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - if (css_tryget(&mem->css)) - return mem; - else { + switch (mem_cgroup_filter(mem, root, cond)) { + case SKIP: prev_css = next_css; goto skip_node; + case SKIP_TREE: + if (mem == root) + return NULL; + /* + * css_rightmost_descendant is not an optimal way to + * skip through a subtree (especially for imbalanced + * trees leaning to right) but that's what we have right + * now. More effective solution would be traversing + * right-up for first non-NULL without calling + * css_next_descendant_pre afterwards. + */ + prev_css = css_rightmost_descendant(next_css); + goto skip_node; + case VISIT: + if (css_tryget(&mem->css)) + return mem; + else { + prev_css = next_css; + goto skip_node; + } + break; } } @@ -968,6 +997,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks + * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -980,15 +1010,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) - return NULL; + if (mem_cgroup_disabled()) { + /* first call must return non-NULL, second return NULL */ + return (struct mem_cgroup *)(unsigned long)!prev; + } if (!root) root = root_mem_cgroup; @@ -999,7 +1032,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - return root; + if (mem_cgroup_filter(root, root, cond) == VISIT) + return root; + return NULL; } rcu_read_lock(); @@ -1022,7 +1057,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited); + memcg = __mem_cgroup_iter_next(root, last_visited, cond); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1033,7 +1068,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, reclaim->generation = iter->generation; } - if (prev && !memcg) + /* + * We have finished the whole tree walk or no group has been + * visited because filter told us to skip the root node. + */ + if (!memcg && (prev || (cond && !last_visited))) goto out_unlock; } out_unlock: @@ -1778,13 +1817,14 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * a) it is over its soft limit * b) any parent up the hierarchy is over its soft limit */ -bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root) { struct mem_cgroup *parent = memcg; if (res_counter_soft_limit_excess(&memcg->res)) - return true; + return VISIT; /* * If any parent up to the root in the hierarchy is over its soft limit @@ -1792,12 +1832,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, */ while((parent = parent_mem_cgroup(parent))) { if (res_counter_soft_limit_excess(&parent->res)) - return true; + return VISIT; if (parent == root) break; } - return false; + return SKIP; } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 1896e7ca494..f2e35099508 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2151,21 +2151,16 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; + mem_cgroup_iter_filter filter = (soft_reclaim) ? + mem_cgroup_soft_reclaim_eligible : NULL; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { + while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { struct lruvec *lruvec; - if (soft_reclaim && - !mem_cgroup_soft_reclaim_eligible(memcg, root)) { - memcg = mem_cgroup_iter(root, memcg, &reclaim); - continue; - } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2185,8 +2180,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, -- cgit v1.2.3-70-g09d2