Diffstat (limited to 'arch/x86_64/mm/numa.c')
-rw-r--r-- | arch/x86_64/mm/numa.c | 189
1 file changed, 138 insertions, 51 deletions
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 21480382100..6ef9f9a7623 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,59 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodemap[] too small (or shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
 {
 	int i;
-	int shift = 20;
-	unsigned long addr,maxend=0;
-
-	for (i = 0; i < numnodes; i++)
-		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
-			maxend = nodes[i].end;
+	int res = -1;
+	unsigned long addr, end;
 
-	while ((1UL << shift) < (maxend / NODEMAPSIZE))
-		shift++;
-
-	printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
-		shift,maxend);
-	memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+	if (shift >= 64)
+		return -1;
+	memset(memnodemap, 0xff, sizeof(memnodemap));
 	for (i = 0; i < numnodes; i++) {
-		if (nodes[i].start == nodes[i].end)
+		addr = nodes[i].start;
+		end = nodes[i].end;
+		if (addr >= end)
 			continue;
-		for (addr = nodes[i].start;
-		     addr < nodes[i].end;
-		     addr += (1UL << shift)) {
-			if (memnodemap[addr >> shift] != 0xff) {
-				printk(KERN_INFO
-	"Your memory is not aligned you need to rebuild your kernel "
-	"with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
-					shift,addr);
+		if ((end >> shift) >= NODEMAPSIZE)
+			return 0;
+		do {
+			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
-			}
 			memnodemap[addr >> shift] = i;
-		}
+			addr += (1UL << shift);
+		} while (addr < end);
+		res = 1;
 	}
+	return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int shift = 20;
+
+	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+		shift++;
+
+	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+		shift);
+
+	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+		printk(KERN_INFO
+	"Your memory is not aligned you need to rebuild your kernel "
+	"with a bigger NODEMAPSIZE shift=%d\n",
+			shift);
+		return -1;
+	}
 	return shift;
 }
 
@@ -89,12 +110,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 
 	start = round_up(start, ZONE_ALIGN);
 
-	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
-	memory_present(nodeid, start_pfn, end_pfn);
 	nodedata_phys = find_e820_area(start, end, pgdat_size);
 	if (nodedata_phys == -1L)
 		panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -132,29 +152,14 @@ void __init setup_node_zones(int nodeid)
 	unsigned long start_pfn, end_pfn;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
-	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
-	memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+	start_pfn = node_start_pfn(nodeid);
+	end_pfn = node_end_pfn(nodeid);
 
-	start_pfn = node_start_pfn(nodeid);
-	end_pfn = node_end_pfn(nodeid);
+	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+		nodeid, start_pfn, end_pfn);
 
-	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
-	/* All nodes > 0 have a zero length zone DMA */
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	if (start_pfn < dma_end_pfn) {
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
-	} else {
-		zones[ZONE_NORMAL] = end_pfn - start_pfn;
-		holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
-	}
-
+	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
 }
@@ -171,7 +176,7 @@ void __init numa_init_array(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
-		cpu_to_node[i] = rr;
+		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
@@ -195,7 +200,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 		while ((x << 1) < sz)
 			x <<= 1;
 		if (x < sz/2)
-			printk("Numa emulation unbalanced. Complain to maintainer\n");
+			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
 		sz = x;
 	}
 
@@ -205,8 +210,6 @@
 		if (i == numa_fake-1)
 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
 		nodes[i].end = nodes[i].start + sz;
-		if (i != numa_fake-1)
-			nodes[i].end--;
 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
 		       i,
 		       nodes[i].start, nodes[i].end,
@@ -257,7 +260,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_to_node[i] = 0;
+		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
@@ -267,6 +270,12 @@ __cpuinit void numa_add_cpu(int cpu)
 	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 }
 
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	cpu_pda(cpu)->nodenumber = node;
+	cpu_to_node[cpu] = node;
+}
+
 unsigned long __init numa_free_all_bootmem(void)
 {
 	int i;
@@ -277,9 +286,26 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
+#ifdef CONFIG_SPARSEMEM
+static void __init arch_sparse_init(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		memory_present(i, node_start_pfn(i), node_end_pfn(i));
+
+	sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
 void __init paging_init(void)
 {
 	int i;
+
+	arch_sparse_init();
+
 	for_each_online_node(i) {
 		setup_node_zones(i);
 	}
@@ -304,8 +330,69 @@ __init int numa_setup(char *opt)
 	return 1;
 }
 
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+	int i;
+	for (i = 0; i < NR_CPUS; i++) {
+		u8 apicid = x86_cpu_to_apicid[i];
+		if (apicid == BAD_APICID)
+			continue;
+		if (apicid_to_node[apicid] == NUMA_NO_NODE)
+			continue;
+		cpu_to_node[i] = apicid_to_node[apicid];
+	}
+}
+
 EXPORT_SYMBOL(cpu_to_node);
 EXPORT_SYMBOL(node_to_cpumask);
 EXPORT_SYMBOL(memnode_shift);
 EXPORT_SYMBOL(memnodemap);
 EXPORT_SYMBOL(node_data);
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * Functions to convert PFNs from/to per node page addresses.
+ * These are out of line because they are quite big.
+ * They could all be tuned by pre-caching more state.
+ * Should do that.
+ */
+
+/* Requires pfn_valid(pfn) to be true */
+struct page *pfn_to_page(unsigned long pfn)
+{
+	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
+	return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
+}
+EXPORT_SYMBOL(pfn_to_page);
+
+unsigned long page_to_pfn(struct page *page)
+{
+	return (long)(((page) - page_zone(page)->zone_mem_map) +
+		      page_zone(page)->zone_start_pfn);
+}
+EXPORT_SYMBOL(page_to_pfn);
+
+int pfn_valid(unsigned long pfn)
+{
+	unsigned nid;
+	if (pfn >= num_physpages)
+		return 0;
+	nid = pfn_to_nid(pfn);
+	if (nid == 0xff)
+		return 0;
+	return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
+}
+EXPORT_SYMBOL(pfn_valid);
+#endif
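
The core of the patch is the rework of compute_hash_shift(): memnodemap[] is a small table indexed by (physical address >> memnode_shift), and the new populate_memnodemap() helper reports, for a candidate shift, whether the table is big enough (0 if not) and whether every node lands in its own slots (-1 on collision, 1 on success). compute_hash_shift() then simply grows the shift while a larger one still works, instead of deriving it from the highest node address as the old code did. The user-space sketch below replays that search so the behaviour is easy to poke at; struct node, the NODEMAPSIZE value, and the two-node layout in main() are stand-ins chosen for illustration, not the kernel's actual definitions.

/*
 * User-space sketch of the memnodemap hash-shift search above.
 * Assumes a 64-bit unsigned long (as on x86_64); NODEMAPSIZE and
 * the node layout below are illustrative values, not the kernel's.
 */
#include <stdio.h>
#include <string.h>

#define NODEMAPSIZE 0xff

struct node {
	unsigned long start, end;	/* physical range [start, end) */
};

static unsigned char memnodemap[NODEMAPSIZE];
static int memnode_shift;

/* Same contract as the patch: 1 if OK, 0 if the table is too small
 * for this shift, -1 if two nodes would share a slot. */
static int populate_memnodemap(const struct node *nodes, int numnodes,
			       int shift)
{
	int i, res = -1;
	unsigned long addr, end;

	if (shift >= 64)
		return -1;
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;	/* collision: shift too big */
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

/* Grow the shift while a coarser table still tells the nodes apart. */
static int compute_hash_shift(const struct node *nodes, int numnodes)
{
	int shift = 20;

	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;
	if (populate_memnodemap(nodes, numnodes, shift) != 1)
		return -1;
	return shift;
}

/* The lookup side, mirroring the kernel's phys_to_nid(): one shift
 * and one table read. */
static int phys_to_nid(unsigned long addr)
{
	return memnodemap[addr >> memnode_shift];
}

int main(void)
{
	/* Two invented, 4GB-aligned nodes of 4GB each. */
	struct node nodes[] = {
		{ 0x000000000UL, 0x100000000UL },
		{ 0x100000000UL, 0x200000000UL },
	};

	memnode_shift = compute_hash_shift(nodes, 2);
	if (memnode_shift < 0) {
		printf("memory too misaligned for this NODEMAPSIZE\n");
		return 1;
	}
	printf("hash shift: %d\n", memnode_shift);	/* prints 32 here */
	printf("0x180000000 -> node %d\n",
	       phys_to_nid(0x180000000UL));		/* prints node 1 */
	return 0;
}

Picking the largest shift that still separates the nodes keeps the hot path cheap: phys_to_nid() remains a single shift plus a one-byte table load, which is exactly what the pfn_to_page() and pfn_valid() helpers in the CONFIG_DISCONTIGMEM section of the patch rely on.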