From 20ba27f52eded4d14e309ac57971c9c83dec46e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 24 Sep 2013 15:27:33 -0700
Subject: revert "memcg, vmscan: do not fall into reclaim-all pass too quickly"

Revert commit e975de998b96 ("memcg, vmscan: do not fall into reclaim-all
pass too quickly")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc..97b0ed16749 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2176,11 +2176,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static int
+static void
 __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
-	int groups_scanned = 0;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2198,7 +2197,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;
 
-			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2226,8 +2224,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 	} while (should_continue_reclaim(zone,
 					 sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
-
-	return groups_scanned;
 }
 
@@ -2235,19 +2231,8 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
 	unsigned long nr_scanned = sc->nr_scanned;
-	int scanned_groups;
 
-	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
-	/*
-	 * memcg iterator might race with other reclaimer or start from
-	 * a incomplete tree walk so the tree walk in __shrink_zone
-	 * might have missed groups that are above the soft limit. Try
-	 * another loop to catch up with others. Do it just once to
-	 * prevent from reclaim latencies when other reclaimers always
-	 * preempt this one.
-	 */
-	if (do_soft_reclaim && !scanned_groups)
-		__shrink_zone(zone, sc, do_soft_reclaim);
+	__shrink_zone(zone, sc, do_soft_reclaim);
 
 	/*
 	 * No group is over the soft limit or those that are do not have
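The logic being reverted here amounts to one extra retry of the soft-limit
filtered pass when a racing reclaimer left this walker with zero groups to
scan. A minimal userspace model of that heuristic follows; the group array
and scan_groups() helper are illustrative stand-ins, not the kernel API.

	#include <stdbool.h>
	#include <stdio.h>

	struct group { bool over_soft_limit; };

	/* Scan only groups matching the filter; return how many were visited. */
	static int scan_groups(struct group *groups, int n, bool soft_only)
	{
		int visited = 0;

		for (int i = 0; i < n; i++) {
			if (soft_only && !groups[i].over_soft_limit)
				continue;
			visited++;	/* reclaim from groups[i] here */
		}
		return visited;
	}

	int main(void)
	{
		struct group groups[3] = { { false }, { false }, { false } };

		/*
		 * The reverted commit retried the filtered pass exactly once
		 * when a concurrent walker raced the shared iterator and
		 * this pass visited nothing ...
		 */
		if (scan_groups(groups, 3, true) == 0)
			scan_groups(groups, 3, true);
		/* ... before later logic falls back to scanning everybody. */
		printf("done\n");
		return 0;
	}
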
From 3120055e869f2a208480f238680d097eec8f0e02 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 24 Sep 2013 15:27:35 -0700
Subject: revert "memcg, vmscan: do not attempt soft limit reclaim if it would
 not scan anything"

Revert commit e839b6a1c8d0 ("memcg, vmscan: do not attempt soft limit
reclaim if it would not scan anything")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 6 +-----
 mm/vmscan.c     | 4 +---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ee0af463f2..5cf7726764c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1872,11 +1872,7 @@ enum mem_cgroup_filter_t
 mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
+	struct mem_cgroup *parent = memcg;
 
 	if (res_counter_soft_limit_excess(&memcg->res))
 		return VISIT;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 97b0ed16749..cdd300b8148 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,9 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	struct mem_cgroup *root = sc->target_mem_cgroup;
-	return !mem_cgroup_disabled() &&
-		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+	return !mem_cgroup_disabled();
 }
 #else
 static bool global_reclaim(struct scan_control *sc)

From 694fbc0fe78518d06efa63910bf4ecee660e7852 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 24 Sep 2013 15:27:37 -0700
Subject: revert "memcg: enhance memcg iterator to support predicates"

Revert commit de57780dc659 ("memcg: enhance memcg iterator to support
predicates")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 49 ++++----------------
 mm/memcontrol.c            | 70 ++++++++------------------------------
 mm/vmscan.c                | 16 +++++++----
 3 files changed, 32 insertions(+), 103 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 60e95872da2..ef2b9bd7faf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
-enum mem_cgroup_filter_t {
-	VISIT,		/* visit current node */
-	SKIP,		/* skip the current node and continue traversal */
-	SKIP_TREE,	/* skip the whole subtree and continue traversal */
-};
-
-/*
- * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
- * iterate through the hierarchy tree. Each tree element is checked by the
- * predicate before it is returned by the iterator. If a filter returns
- * SKIP or SKIP_TREE then the iterator code continues traversal (with the
- * next node down the hierarchy or the next node that doesn't belong under the
- * memcg's subtree.
- */
-typedef enum mem_cgroup_filter_t
-(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
-
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond);
-
-static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
-				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
-{
-	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
-}
-
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+				   struct mem_cgroup *,
+				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
@@ -260,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 				struct mem_cgroup *root);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
@@ -376,15 +349,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 				struct page *oldpage,
 				struct page *newpage, bool migration_ok)
 {
 }
-static inline struct mem_cgroup *
-mem_cgroup_iter_cond(struct mem_cgroup *root,
-		struct mem_cgroup *prev,
-		struct mem_cgroup_reclaim_cookie *reclaim,
-		mem_cgroup_iter_filter cond)
-{
-	/* first call must return non-NULL, second return NULL */
-	return (struct mem_cgroup *)(unsigned long)!prev;
-}
 
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -471,11 +435,10 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 			struct mem_cgroup *root)
 {
-	return VISIT;
+	return false;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 916892c2b8e..65e7bec4b0f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -862,15 +862,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -878,7 +869,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -896,31 +887,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -984,7 +955,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
@@ -997,18 +967,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1019,9 +986,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1044,7 +1009,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1055,11 +1020,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1804,14 +1765,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 *	a) it is over its soft limit
 *	b) any parent up the hierarchy is over its soft limit
 */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
		struct mem_cgroup *root)
 {
 	struct mem_cgroup *parent = memcg;
 
 	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+		return true;
 
 	/*
 	 * If any parent up to the root in the hierarchy is over its soft limit
@@ -1819,12 +1779,12 @@ mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 	 */
 	while ((parent = parent_mem_cgroup(parent))) {
 		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
+			return true;
 		if (parent == root)
 			break;
 	}
 
-	return SKIP;
+	return false;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cdd300b8148..17a7134de4d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2185,16 +2185,21 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg = NULL;
-		mem_cgroup_iter_filter filter = (soft_reclaim) ?
-			mem_cgroup_soft_reclaim_eligible : NULL;
+		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
 			struct lruvec *lruvec;
 
+			if (soft_reclaim &&
+			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+				memcg = mem_cgroup_iter(root, memcg, &reclaim);
+				continue;
+			}
+
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2214,7 +2219,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-		}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
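The predicate mechanism removed by the commit above let a callback steer a
pre-order hierarchy walk: VISIT returns the node, SKIP moves past it, and
SKIP_TREE prunes its whole subtree. A compact userspace sketch of that
control flow; the three-node tree and walk() helper are invented here and
stand in for the memcg hierarchy and iterator.

	#include <stdio.h>

	enum filter_t { VISIT, SKIP, SKIP_TREE };

	struct node {
		const char *name;
		struct node *child, *sibling;
	};

	typedef enum filter_t (*iter_filter)(struct node *);

	static void walk(struct node *n, iter_filter cond)
	{
		while (n) {
			enum filter_t f = cond ? cond(n) : VISIT;

			if (f == VISIT)
				printf("visit %s\n", n->name);
			if (f != SKIP_TREE && n->child)
				walk(n->child, cond);	/* descend unless pruned */
			n = n->sibling;
		}
	}

	static enum filter_t skip_b(struct node *n)
	{
		return n->name[0] == 'b' ? SKIP_TREE : VISIT;
	}

	int main(void)
	{
		struct node c = { "c", NULL, NULL };
		struct node b = { "b", &c, NULL };	/* pruning b also hides c */
		struct node a = { "a", &b, NULL };

		walk(&a, skip_b);	/* prints only "visit a" */
		return 0;
	}
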
From b1aff7fcf86c88472b0a70f15d89d7a4adba07bb Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 24 Sep 2013 15:27:38 -0700
Subject: revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim"

Revert commit a5b7c87f9207 ("vmscan, memcg: do softlimit reclaim also for
targeted reclaim")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h |  6 ++----
 mm/memcontrol.c            | 14 +++++---------
 mm/vmscan.c                |  4 ++--
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ef2b9bd7faf..6054c9f3a5e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -234,8 +234,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-				struct mem_cgroup *root);
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -435,8 +434,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-			struct mem_cgroup *root)
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
 {
 	return false;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 65e7bec4b0f..47cdc7eb1a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1760,13 +1760,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 #endif
 
 /*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
+ * A group is eligible for the soft limit reclaim if
+ *	a) it is over its soft limit
 *	b) any parent up the hierarchy is over its soft limit
 */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *parent = memcg;
 
@@ -1774,14 +1772,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 	return true;
 
 	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
+	 * If any parent up the hierarchy is over its soft limit then we
+	 * have to obey and reclaim from this group as well.
 	 */
 	while ((parent = parent_mem_cgroup(parent))) {
 		if (res_counter_soft_limit_excess(&parent->res))
 			return true;
-		if (parent == root)
-			break;
 	}
 
 	return false;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17a7134de4d..0e081cada4b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,7 +142,7 @@ static bool global_reclaim(struct scan_control *sc)
 
 static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
 {
-	return !mem_cgroup_disabled();
+	return !mem_cgroup_disabled() && global_reclaim(sc);
 }
 #else
@@ -2195,7 +2195,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			struct lruvec *lruvec;
 
 			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
+			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
 				memcg = mem_cgroup_iter(root, memcg, &reclaim);
 				continue;
 			}
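After this revert the eligibility test walks unconditionally to the top of
the hierarchy: a group is reclaimed if it or any ancestor exceeds its soft
limit. A userspace sketch of that rule, with an illustrative cgroup struct
in place of the kernel's res_counter machinery.

	#include <stdbool.h>
	#include <stdio.h>

	struct cgroup {
		struct cgroup *parent;
		long usage, soft_limit;
	};

	static bool soft_limit_excess(const struct cgroup *cg)
	{
		return cg->usage > cg->soft_limit;
	}

	/* Eligible if the group itself or any ancestor is over its soft limit. */
	static bool soft_reclaim_eligible(const struct cgroup *cg)
	{
		for (; cg; cg = cg->parent)
			if (soft_limit_excess(cg))
				return true;
		return false;
	}

	int main(void)
	{
		struct cgroup root  = { NULL,  900, 500 };	/* over its limit */
		struct cgroup child = { &root, 100, 800 };	/* under its own */

		/* child is eligible because its parent exceeds the soft limit */
		printf("child eligible: %d\n", soft_reclaim_eligible(&child));
		return 0;
	}
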
From 0608f43da64a1f1c42507304b5f25bc8b1227aa4 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 24 Sep 2013 15:27:41 -0700
Subject: revert "memcg, vmscan: integrate soft reclaim tighter with zone
 shrinking code"

Revert commit 3b38722efd9f ("memcg, vmscan: integrate soft reclaim
tighter with zone shrinking code")

I merged this prematurely - Michal and Johannes still disagree about the
overall design direction and the future remains unclear.

Cc: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h |  10 ++-
 mm/memcontrol.c            | 163 +++++++++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  62 ++++++++---------
 3 files changed, 175 insertions(+), 60 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6054c9f3a5e..ecc82b37c4c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -234,7 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -434,9 +436,11 @@ static inline
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
-	return false;
+	return 0;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 852dbec07ce..1c52ddbc839 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1991,28 +1991,57 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 }
 #endif
 
-/*
- * A group is eligible for the soft limit reclaim if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return true;
-
-	/*
-	 * If any parent up the hierarchy is over its soft limit then we
-	 * have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return true;
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
+			break;
 	}
-
-	return false;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -4761,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0e081cada4b..beb35778c69 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,21 +139,11 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return !mem_cgroup_disabled() && global_reclaim(sc);
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
-
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-	return false;
-}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2174,8 +2164,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void
-__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 
@@ -2194,12 +2183,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		do {
 			struct lruvec *lruvec;
 
-			if (soft_reclaim &&
-			    !mem_cgroup_soft_reclaim_eligible(memcg)) {
-				memcg = mem_cgroup_iter(root, memcg, &reclaim);
-				continue;
-			}
-
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2230,24 +2213,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 					 sc->nr_scanned - nr_scanned, sc));
 }
 
-
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
-	unsigned long nr_scanned = sc->nr_scanned;
-
-	__shrink_zone(zone, sc, do_soft_reclaim);
-
-	/*
-	 * No group is over the soft limit or those that are do not have
-	 * pages in the zone we are reclaiming so we have to reclaim everybody
-	 */
-	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
-		__shrink_zone(zone, sc, false);
-		return;
-	}
-}
-
 /* Returns true if compaction should go ahead for a high-order request */
 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
@@ -2309,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2348,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 					continue;
 				}
 			}
+			/*
+			 * This steals pages from memory cgroups over softlimit
+			 * and returns the number of reclaimed pages and
+			 * scanned pages. This works for global memory pressure
+			 * and balancing, not for a memcg's limit.
+			 */
+			nr_soft_scanned = 0;
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						sc->order, sc->gfp_mask,
+						&nr_soft_scanned);
+			sc->nr_reclaimed += nr_soft_reclaimed;
+			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}
 
@@ -2941,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3055,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+							order, sc.gfp_mask,
+							&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned
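The restored mem_cgroup_soft_reclaim() above bounds its victim loop with two
conditions: stop once roughly a quarter of the soft-limit excess has been
reclaimed, or after a maximum number of fruitless full-tree passes. A
userspace model of that termination heuristic; reclaim_one() and MAX_LOOPS
are stand-ins for the kernel's per-victim shrink and
MEM_CGROUP_MAX_RECLAIM_LOOPS.

	#include <stdio.h>

	#define MAX_LOOPS 100	/* stand-in for MEM_CGROUP_MAX_RECLAIM_LOOPS */

	static long reclaim_one(void)
	{
		return 1;	/* pages reclaimed from one victim group */
	}

	int main(void)
	{
		long excess = 1024;	/* pages over the soft limit */
		long total = 0;
		int loop = 0;

		while (1) {
			total += reclaim_one();
			loop++;
			/*
			 * excess >> 2 keeps a single call from reclaiming the
			 * whole excess, while still making enough progress
			 * that we do not come straight back to this group.
			 */
			if (total >= (excess >> 2) || loop > MAX_LOOPS)
				break;
		}
		printf("reclaimed %ld of %ld excess in %d loops\n",
		       total, excess, loop);
		return 0;
	}
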
From 117aad1e9e4d97448d1df3f84b08bd65811e6d6a Mon Sep 17 00:00:00 2001
From: Rafael Aquini
Date: Mon, 30 Sep 2013 13:45:16 -0700
Subject: mm: avoid reinserting isolated balloon pages into LRU lists

Isolated balloon pages can wrongly end up in LRU lists when
migrate_pages() finishes its round without draining all the isolated
page list.

The same issue can happen when reclaim_clean_pages_from_list() tries to
reclaim pages from an isolated page list, before migration, in the CMA
path. Such balloon page leak opens a race window against LRU lists
shrinkers that leads us to the following kernel panic:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000028
  IP: [] shrink_page_list+0x24e/0x897
  PGD 3cda2067 PUD 3d713067 PMD 0
  Oops: 0000 [#1] SMP
  CPU: 0 PID: 340 Comm: kswapd0 Not tainted 3.12.0-rc1-22626-g4367597 #87
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  RIP: shrink_page_list+0x24e/0x897
  RSP: 0000:ffff88003da499b8  EFLAGS: 00010286
  RAX: 0000000000000000 RBX: ffff88003e82bd60 RCX: 00000000000657d5
  RDX: 0000000000000000 RSI: 000000000000031f RDI: ffff88003e82bd40
  RBP: ffff88003da49ab0 R08: 0000000000000001 R09: 0000000081121a45
  R10: ffffffff81121a45 R11: ffff88003c4a9a28 R12: ffff88003e82bd40
  R13: ffff88003da0e800 R14: 0000000000000001 R15: ffff88003da49d58
  FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000067d9000 CR3: 000000003ace5000 CR4: 00000000000407b0
  Call Trace:
    shrink_inactive_list+0x240/0x3de
    shrink_lruvec+0x3e0/0x566
    __shrink_zone+0x94/0x178
    shrink_zone+0x3a/0x82
    balance_pgdat+0x32a/0x4c2
    kswapd+0x2f0/0x372
    kthread+0xa2/0xaa
    ret_from_fork+0x7c/0xb0
  Code: 80 7d 8f 01 48 83 95 68 ff ff ff 00 4c 89 e7 e8 5a 7b 00 00 48 85 c0 49 89 c5 75 08 80 7d 8f 00 74 3e eb 31 48 8b 80 18 01 00 00 <48> 8b 74 0d 48 8b 78 30 be 02 00 00 00 ff d2 eb
  RIP [] shrink_page_list+0x24e/0x897
   RSP
  CR2: 0000000000000028
  ---[ end trace 703d2451af6ffbfd ]---
  Kernel panic - not syncing: Fatal exception

This patch fixes the issue, by assuring the proper tests are made at
putback_movable_pages() & reclaim_clean_pages_from_list() to avoid
isolated balloon pages being wrongly reinserted in LRU lists.

[akpm@linux-foundation.org: clarify awkward comment text]
Signed-off-by: Rafael Aquini
Reported-by: Luiz Capitulino
Tested-by: Luiz Capitulino
Cc: Mel Gorman
Cc: Rik van Riel
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/balloon_compaction.h | 25 +++++++++++++++++++++++++
 mm/migrate.c                       |  2 +-
 mm/vmscan.c                        |  4 +++-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f7f1d7169b1..089743ade73 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -158,6 +158,26 @@ static inline bool balloon_page_movable(struct page *page)
 	return false;
 }
 
+/*
+ * isolated_balloon_page - identify an isolated balloon page on private
+ *			   compaction/migration page lists.
+ *
+ * After a compaction thread isolates a balloon page for migration, it raises
+ * the page refcount to prevent concurrent compaction threads from re-isolating
+ * the same page. For that reason putback_movable_pages(), or other routines
+ * that need to identify isolated balloon pages on private pagelists, cannot
+ * rely on balloon_page_movable() to accomplish the task.
+ */
+static inline bool isolated_balloon_page(struct page *page)
+{
+	/* Already isolated balloon pages, by default, have a raised refcount */
+	if (page_flags_cleared(page) && !page_mapped(page) &&
+	    page_count(page) >= 2)
+		return __is_movable_balloon_page(page);
+
+	return false;
+}
+
 /*
  * balloon_page_insert - insert a page into the balloon's page list and make
  *			 the page->mapping assignment accordingly.
@@ -243,6 +263,11 @@ static inline bool balloon_page_movable(struct page *page)
 	return false;
 }
 
+static inline bool isolated_balloon_page(struct page *page)
+{
+	return false;
+}
+
 static inline bool balloon_page_isolate(struct page *page)
 {
 	return false;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9c8d5f59d30..a26bccd44cc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(balloon_page_movable(page)))
+		if (unlikely(isolated_balloon_page(page)))
 			balloon_page_putback(page);
 		else
 			putback_lru_page(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index beb35778c69..53f2f82f83a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
 #include
 #include
+#include
 
 #include "internal.h"
 
@@ -1113,7 +1114,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	LIST_HEAD(clean_pages);
 
 	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (page_is_file_cache(page) && !PageDirty(page)) {
+		if (page_is_file_cache(page) && !PageDirty(page) &&
+		    !isolated_balloon_page(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}

From ae39332162a837c3791bb21172d22382a90a6fd1 Mon Sep 17 00:00:00 2001
From: Andrew Vagin
Date: Wed, 16 Oct 2013 13:46:46 -0700
Subject: mm/vmscan.c: don't forget to free shrinker->nr_deferred

This leak was added by commit 1d3d4437eae1 ("vmscan: per-node deferred
work").

unreferenced object 0xffff88006ada3bd0 (size 8):
  comm "criu", pid 14781, jiffies 4295238251 (age 105.641s)
  hex dump (first 8 bytes):
    00 00 00 00 00 00 00 00    ........
  backtrace:
    [] kmemleak_alloc+0x5e/0xc0
    [] __kmalloc+0x247/0x310
    [] register_shrinker+0x3c/0xa0
    [] sget+0x5ab/0x670
    [] proc_mount+0x54/0x170
    [] mount_fs+0x43/0x1b0
    [] vfs_kern_mount+0x72/0x110
    [] kern_mount_data+0x19/0x30
    [] pid_ns_prepare_proc+0x20/0x40
    [] alloc_pid+0x466/0x4a0
    [] copy_process+0xc6a/0x1860
    [] do_fork+0x8b/0x370
    [] SyS_clone+0x16/0x20
    [] stub_clone+0x69/0x90
    [] 0xffffffffffffffff

Signed-off-by: Andrew Vagin
Cc: Mel Gorman
Cc: Michal Hocko
Cc: Rik van Riel
Cc: Johannes Weiner
Cc: Glauber Costa
Cc: Chuck Lever
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 53f2f82f83a..eea668d9cff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
+	kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
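The pattern behind this one-line fix generalizes: register_shrinker()
kmallocs a per-node nr_deferred array, so unregister_shrinker() must free it
or every register/unregister cycle (here, each proc mount) leaks the
allocation. A userspace illustration of the paired alloc/free; the struct
layout and helpers mirror the kernel's names but are not the kernel API.

	#include <stdlib.h>

	struct shrinker {
		unsigned long *nr_deferred;
	};

	static int register_shrinker(struct shrinker *s, int nr_nodes)
	{
		/* the allocation the kmemleak report above points at */
		s->nr_deferred = calloc(nr_nodes, sizeof(*s->nr_deferred));
		return s->nr_deferred ? 0 : -1;
	}

	static void unregister_shrinker(struct shrinker *s)
	{
		/* ... unlink from the shrinker list ... */
		free(s->nr_deferred);	/* the line the fix adds (kfree in-kernel) */
		s->nr_deferred = NULL;
	}

	int main(void)
	{
		struct shrinker s;

		if (register_shrinker(&s, 4) == 0)
			unregister_shrinker(&s); /* no leak: every alloc has a free */
		return 0;
	}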