[PATCH] Zone reclaim: Reclaim logic

Some bits for zone reclaim exists in 2.6.15 but they are not usable. This patch fixes them up, removes unused code and makes zone reclaim usable. Zone reclaim allows the reclaiming of pages from a zone if the number of free pages falls below the watermarks even if other zones still have enough pages available. Zone reclaim is of particular importance for NUMA machines. It can be more beneficial to reclaim a page than taking the performance penalties that come with allocating a page on a remote zone. Zone reclaim is enabled if the maximum distance to another node is higher than RECLAIM_DISTANCE, which may be defined by an arch. By default RECLAIM_DISTANCE is 20. 20 is the distance to another node in the same component (enclosure or motherboard) on IA64. The meaning of the NUMA distance information seems to vary by arch. If zone reclaim is not successful then no further reclaim attempts will occur for a certain time period (ZONE_RECLAIM_INTERVAL). This patch was discussed before. See http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2 Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <clameter@sgi.com> 2006-01-18 17:42:31 -0800
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-18 19:20:17 -0800
commit: 9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 (patch)
tree: 20160098ec6ed8738cfecfc5f81181ad22b44e60 /mm
parent: f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 (diff)
2 files changed, 82 insertions, 3 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d..df54e2fc8ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+				    !zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-				node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5117b6897a..2e34b61a70c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask	= gfp_mask,
+		.may_writepage	= 0,
+		.may_swap	= 0,
+		.nr_mapped	= read_page_state(nr_mapped),
+		.nr_scanned	= 0,
+		.nr_reclaimed	= 0,
+		.priority	= 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
author	Christoph Lameter <clameter@sgi.com>	2006-01-18 17:42:31 -0800
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-18 19:20:17 -0800
commit	9eeff2395e3cfd05c9b2e6074ff943a34b0c5c21 (patch)
tree	20160098ec6ed8738cfecfc5f81181ad22b44e60 /mm
parent	f1fd1067ece574ab56e4a70878b9a5a1ed4c3c42 (diff)