Diffstat (limited to 'arch/powerpc/mm/numa.c')
-rw-r--r-- | arch/powerpc/mm/numa.c | 182
1 file changed, 119 insertions, 63 deletions
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 80d110635d2..002878ccf90 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -17,7 +17,7 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
-#include <linux/lmb.h>
+#include <linux/memblock.h>
 #include <linux/of.h>
 #include <linux/pfn.h>
 #include <asm/sparsemem.h>
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);
 
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
+
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const unsigned int *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
 /*
  * Allocate node_to_cpumask_map based on number of available nodes
@@ -204,6 +210,39 @@ static const u32 *of_get_usable_memory(struct device_node *memory)
 	return prop;
 }
 
+int __node_distance(int a, int b)
+{
+	int i;
+	int distance = LOCAL_DISTANCE;
+
+	if (!form1_affinity)
+		return distance;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+			break;
+
+		/* Double the distance for each NUMA level */
+		distance *= 2;
+	}
+
+	return distance;
+}
+
+static void initialize_distance_lookup_table(int nid,
+		const unsigned int *associativity)
+{
+	int i;
+
+	if (!form1_affinity)
+		return;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		distance_lookup_table[nid][i] =
+			associativity[distance_ref_points[i]];
+	}
+}
+
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
@@ -225,6 +264,10 @@ static int of_node_to_nid_single(struct device_node *device)
 		/* POWER4 LPAR uses 0xffff as invalid node */
 		if (nid == 0xffff || nid >= MAX_NUMNODES)
 			nid = -1;
+
+		if (nid > 0 && tmp[0] >= distance_ref_points_depth)
+			initialize_distance_lookup_table(nid, tmp);
+
 out:
 	return nid;
 }
@@ -251,26 +294,10 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL_GPL(of_node_to_nid);
 
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine. This resource then has different associativity
- * characteristics relative to its multiple connections. We ignore
- * this for now. We also assume that all cpu and memory sets have
- * their distances represented at a common level. This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
 static int __init find_min_common_depth(void)
 {
-	int depth, index;
-	const unsigned int *ref_points;
+	int depth;
 	struct device_node *rtas_root;
-	unsigned int len;
 	struct device_node *chosen;
 	const char *vec5;
 
@@ -280,18 +307,28 @@ static int __init find_min_common_depth(void)
 		return -1;
 
 	/*
-	 * this property is 2 32-bit integers, each representing a level of
-	 * depth in the associativity nodes. The first is for an SMP
-	 * configuration (should be all 0's) and the second is for a normal
-	 * NUMA configuration.
+	 * This property is a set of 32-bit integers, each representing
+	 * an index into the ibm,associativity nodes.
+	 *
+	 * With form 0 affinity the first integer is for an SMP configuration
+	 * (should be all 0's) and the second is for a normal NUMA
+	 * configuration. We have only one level of NUMA.
+	 *
+	 * With form 1 affinity the first integer is the most significant
+	 * NUMA boundary and the following are progressively less significant
+	 * boundaries. There can be more than one level of NUMA.
 	 */
-	index = 1;
-	ref_points = of_get_property(rtas_root,
-			"ibm,associativity-reference-points", &len);
+	distance_ref_points = of_get_property(rtas_root,
+					"ibm,associativity-reference-points",
+					&distance_ref_points_depth);
+
+	if (!distance_ref_points) {
+		dbg("NUMA: ibm,associativity-reference-points not found.\n");
+		goto err;
+	}
+
+	distance_ref_points_depth /= sizeof(int);
 
-	/*
-	 * For form 1 affinity information we want the first field
-	 */
 #define VEC5_AFFINITY_BYTE 5
 #define VEC5_AFFINITY 0x80
 	chosen = of_find_node_by_path("/chosen");
@@ -299,19 +336,38 @@ static int __init find_min_common_depth(void)
 		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
 		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
 			dbg("Using form 1 affinity\n");
-			index = 0;
+			form1_affinity = 1;
 		}
 	}
 
-	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
-		depth = ref_points[index];
+	if (form1_affinity) {
+		depth = distance_ref_points[0];
 	} else {
-		dbg("NUMA: ibm,associativity-reference-points not found.\n");
-		depth = -1;
+		if (distance_ref_points_depth < 2) {
+			printk(KERN_WARNING "NUMA: "
+				"short ibm,associativity-reference-points\n");
+			goto err;
+		}
+
+		depth = distance_ref_points[1];
 	}
-	of_node_put(rtas_root);
 
+	/*
+	 * Warn and cap if the hardware supports more than
+	 * MAX_DISTANCE_REF_POINTS domains.
+	 */
+	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+		printk(KERN_WARNING "NUMA: distance array capped at "
+			"%d entries\n", MAX_DISTANCE_REF_POINTS);
+		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+	}
+
+	of_node_put(rtas_root);
 	return depth;
+
+err:
+	of_node_put(rtas_root);
+	return -1;
 }
 
 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
@@ -351,7 +407,7 @@ struct of_drconf_cell {
 #define DRCONF_MEM_RESERVED 0x00000080
 
 /*
- * Read the next lmb list entry from the ibm,dynamic-memory property
+ * Read the next memblock list entry from the ibm,dynamic-memory property
  * and return the information in the provided of_drconf_cell structure.
  */
 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
@@ -372,8 +428,8 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
 /*
  * Retreive and validate the ibm,dynamic-memory property of the device tree.
  *
- * The layout of the ibm,dynamic-memory property is a number N of lmb
- * list entries followed by N lmb list entries. Each lmb list entry
+ * The layout of the ibm,dynamic-memory property is a number N of memblock
+ * list entries followed by N memblock list entries. Each memblock list entry
  * contains information as layed out in the of_drconf_cell struct above.
 */
 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
@@ -540,19 +596,19 @@
 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 						      unsigned long size)
 {
 	/*
-	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
+	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
 	 * we've already adjusted it for the limit and it takes care of
 	 * having memory holes below the limit. Also, in the case of
 	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
 	 */
-	if (start + size <= lmb_end_of_DRAM())
+	if (start + size <= memblock_end_of_DRAM())
 		return size;
 
-	if (start >= lmb_end_of_DRAM())
+	if (start >= memblock_end_of_DRAM())
 		return 0;
 
-	return lmb_end_of_DRAM() - start;
+	return memblock_end_of_DRAM() - start;
 }
 
 /*
@@ -731,7 +787,7 @@ new_range:
 	}
 
 	/*
-	 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
+	 * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
 	 * property in the ibm,dynamic-reconfiguration-memory node.
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
@@ -743,8 +799,8 @@
 
 static void __init setup_nonnuma(void)
 {
-	unsigned long top_of_ram = lmb_end_of_DRAM();
-	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned long top_of_ram = memblock_end_of_DRAM();
+	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
 	unsigned int i, nid = 0;
 
@@ -753,9 +809,9 @@ static void __init setup_nonnuma(void)
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for (i = 0; i < lmb.memory.cnt; ++i) {
-		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
-		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+	for (i = 0; i < memblock.memory.cnt; ++i) {
+		start_pfn = memblock.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + memblock_size_pages(&memblock.memory, i);
 
 		fake_numa_create_new_node(end_pfn, &nid);
 		add_active_range(nid, start_pfn, end_pfn);
@@ -813,7 +869,7 @@ static void __init dump_numa_memory_topology(void)
 
 		count = 0;
 
-		for (i = 0; i < lmb_end_of_DRAM();
+		for (i = 0; i < memblock_end_of_DRAM();
 		     i += (1 << SECTION_SIZE_BITS)) {
 			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 				if (count == 0)
@@ -833,7 +889,7 @@ static void __init dump_numa_memory_topology(void)
 }
 
 /*
- * Allocate some memory, satisfying the lmb or bootmem allocator where
+ * Allocate some memory, satisfying the memblock or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
@@ -847,11 +903,11 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 	int new_nid;
 	unsigned long ret_paddr;
 
-	ret_paddr = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);
+	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 
 	/* retry over all memory */
 	if (!ret_paddr)
-		ret_paddr = __lmb_alloc_base(size, align, lmb_end_of_DRAM());
+		ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
 
 	if (!ret_paddr)
 		panic("numa.c: cannot allocate %lu bytes for node %d",
@@ -861,14 +917,14 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 
 	/*
 	 * We initialize the nodes in numeric order: 0, 1, 2...
-	 * and hand over control from the LMB allocator to the
+	 * and hand over control from the MEMBLOCK allocator to the
 	 * bootmem allocator. If this function is called for
 	 * node 5, then we know that all nodes <5 are using the
-	 * bootmem allocator instead of the LMB allocator.
+	 * bootmem allocator instead of the MEMBLOCK allocator.
 	 *
 	 * So, check the nid from which this allocation came
 	 * and double check to see if we need to use bootmem
-	 * instead of the LMB. We don't free the LMB memory
+	 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
 	 * since it would be useless.
 	 */
 	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
@@ -893,9 +949,9 @@ static void mark_reserved_regions_for_nid(int nid)
 	struct pglist_data *node = NODE_DATA(nid);
 	int i;
 
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].base;
-		unsigned long size = lmb.reserved.region[i].size;
+	for (i = 0; i < memblock.reserved.cnt; i++) {
+		unsigned long physbase = memblock.reserved.region[i].base;
+		unsigned long size = memblock.reserved.region[i].size;
 		unsigned long start_pfn = physbase >> PAGE_SHIFT;
 		unsigned long end_pfn = PFN_UP(physbase + size);
 		struct node_active_region node_ar;
@@ -903,7 +959,7 @@ static void mark_reserved_regions_for_nid(int nid)
 			node->node_spanned_pages;
 
 		/*
-		 * Check to make sure that this lmb.reserved area is
+		 * Check to make sure that this memblock.reserved area is
 		 * within the bounds of the node that we care about.
 		 * Checking the nid of the start and end points is not
 		 * sufficient because the reserved area could span the
@@ -961,7 +1017,7 @@ void __init do_init_bootmem(void)
 	int nid;
 
 	min_low_pfn = 0;
-	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
 	max_pfn = max_low_pfn;
 
 	if (parse_numa_properties())
@@ -1038,7 +1094,7 @@ void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
 	free_area_init_nodes(max_zone_pfns);
 }
 
@@ -1113,7 +1169,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,
 /*
  * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
- * each lmb.
+ * each memblock.
 */
 int hot_add_node_scn_to_nid(unsigned long scn_addr)
 {
@@ -1154,8 +1210,8 @@ int hot_add_node_scn_to_nid(unsigned long scn_addr)
 
 /*
  * Find the node associated with a hot added memory section. Section
- * corresponds to a SPARSEMEM section, not an LMB. It is assumed that
- * sections are fully contained within a single LMB.
+ * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that
+ * sections are fully contained within a single MEMBLOCK.
 */
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
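
A minimal sketch of the behaviour the new __node_distance() implements, runnable in userspace (not part of the patch; the node count, reference-point depth, and domain numbers below are hypothetical values chosen for illustration, not data from real firmware):

#include <stdio.h>

/* Standalone model of the form 1 distance scheme. LOCAL_DISTANCE matches
 * the kernel's base value; everything else here is invented. */
#define LOCAL_DISTANCE 10
#define MAX_DISTANCE_REF_POINTS 4
#define MAX_NUMNODES 3

static int distance_ref_points_depth = 2;

/* Per-node copy of the reference-point domains, as filled in by
 * initialize_distance_lookup_table(): entry 0 is the most significant
 * (node-level) domain, entry 1 a progressively less significant one. */
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS] = {
	{ 0x10, 0x1 },	/* node 0 */
	{ 0x11, 0x1 },	/* node 1: same coarse domain as node 0 */
	{ 0x12, 0x2 },	/* node 2: differs from node 0 at every level */
};

/* Mirrors __node_distance(): double the base distance for each NUMA
 * level on which the two nodes' domains differ, stopping at the first
 * level where they match. */
static int node_distance(int a, int b)
{
	int i, distance = LOCAL_DISTANCE;

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;
		distance *= 2;
	}
	return distance;
}

int main(void)
{
	printf("d(0,0)=%d d(0,1)=%d d(0,2)=%d\n",
	       node_distance(0, 0), node_distance(0, 1), node_distance(0, 2));
	/* prints d(0,0)=10 d(0,1)=20 d(0,2)=40 */
	return 0;
}

With form 0 affinity the kernel never gets past the !form1_affinity check, so __node_distance() reports the flat LOCAL_DISTANCE default for every pair of nodes.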
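
The depth selection in find_min_common_depth() likewise differs between the two forms: form 0 takes the second reference point, form 1 the first. A hedged sketch of just that choice (pick_depth() is an invented helper name; in the patch this logic is inline, and the sample property cells are made up):

#include <stdio.h>

/* Returns the index into ibm,associativity that holds the node id,
 * or -1 if the property is too short. */
static int pick_depth(const unsigned int *ref_points, int nentries, int form1)
{
	if (form1)
		/* form 1: first entry is the most significant boundary */
		return nentries >= 1 ? (int)ref_points[0] : -1;

	/* form 0: entry 0 is the SMP case, entry 1 the NUMA case */
	return nentries >= 2 ? (int)ref_points[1] : -1;
}

int main(void)
{
	/* hypothetical ibm,associativity-reference-points contents */
	unsigned int ref_points[] = { 0x4, 0x2 };
	int n = sizeof(ref_points) / sizeof(ref_points[0]);

	printf("form 0 depth: %d\n", pick_depth(ref_points, n, 0)); /* 2 */
	printf("form 1 depth: %d\n", pick_depth(ref_points, n, 1)); /* 4 */
	return 0;
}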
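
Finally, the numa_enforce_memory_limit() hunk is a mechanical lmb-to-memblock rename, but the clamping it preserves is worth spelling out: the usable size of a range is whatever portion of it lies below the end of DRAM. A self-contained sketch with the memblock_end_of_DRAM() call replaced by a parameter (the sample addresses are arbitrary):

#include <stdio.h>

/* Same three-case clamp as numa_enforce_memory_limit(). */
static unsigned long clamp_to_dram(unsigned long start, unsigned long size,
				   unsigned long dram_end)
{
	if (start + size <= dram_end)	/* range entirely below the limit */
		return size;

	if (start >= dram_end)		/* range entirely above the limit */
		return 0;

	return dram_end - start;	/* range straddles the limit: truncate */
}

int main(void)
{
	unsigned long end = 0x40000000;	/* arbitrary 1 GiB DRAM end */

	printf("%lx\n", clamp_to_dram(0x10000000, 0x1000000, end)); /* 1000000 */
	printf("%lx\n", clamp_to_dram(0x50000000, 0x1000000, end)); /* 0 */
	printf("%lx\n", clamp_to_dram(0x3f000000, 0x2000000, end)); /* 1000000 */
	return 0;
}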