diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/fault.c | 19 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 129 | ||||
-rw-r--r-- | arch/x86_64/mm/ioremap.c | 37 | ||||
-rw-r--r-- | arch/x86_64/mm/k8topology.c | 1 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 124 | ||||
-rw-r--r-- | arch/x86_64/mm/srat.c | 6 |
6 files changed, 194 insertions, 122 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index b75b872ec15..3a63707a698 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -308,18 +308,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long flags; siginfo_t info; -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("page_fault: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - /* get the address */ __asm__("movq %%cr2,%0":"=r" (address)); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, @@ -571,3 +559,10 @@ do_sigbus: force_sig_info(SIGBUS, &info, tsk); return; } + +static int __init enable_pagefaulttrace(char *str) +{ + page_fault_trace = 1; + return 0; +} +__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index e60a1a848de..c016dfe8478 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,6 +22,7 @@ #include <linux/pagemap.h> #include <linux/bootmem.h> #include <linux/proc_fs.h> +#include <linux/pci.h> #include <asm/processor.h> #include <asm/system.h> @@ -36,16 +37,13 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/smp.h> +#include <asm/sections.h> #ifndef Dprintk #define Dprintk(x...) #endif -#ifdef CONFIG_GART_IOMMU -extern int swiotlb; -#endif - -extern char _stext[]; +static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -86,9 +84,6 @@ void show_mem(void) /* References to section boundaries */ -extern char _text, _etext, _edata, __bss_start, _end[]; -extern char __init_begin, __init_end; - int after_bootmem; static void *spp_getpage(void) @@ -308,42 +303,81 @@ void __init init_memory_mapping(unsigned long start, unsigned long end) table_end<<PAGE_SHIFT); } -extern struct x8664_pda cpu_pda[NR_CPUS]; +void __cpuinit zap_low_mappings(int cpu) +{ + if (cpu == 0) { + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + } else { + /* + * For AP's, zap the low identity mappings by changing the cr3 + * to init_level4_pgt and doing local flush tlb all + */ + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); + } + __flush_tlb_all(); +} -/* Assumes all CPUs still execute in init_mm */ -void zap_low_mappings(void) +/* Compute zone sizes for the DMA and DMA32 zones in a node. */ +__init void +size_zones(unsigned long *z, unsigned long *h, + unsigned long start_pfn, unsigned long end_pfn) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - flush_tlb_all(); + int i; + unsigned long w; + + for (i = 0; i < MAX_NR_ZONES; i++) + z[i] = 0; + + if (start_pfn < MAX_DMA_PFN) + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; + if (start_pfn < MAX_DMA32_PFN) { + unsigned long dma32_pfn = MAX_DMA32_PFN; + if (dma32_pfn > end_pfn) + dma32_pfn = end_pfn; + z[ZONE_DMA32] = dma32_pfn - start_pfn; + } + z[ZONE_NORMAL] = end_pfn - start_pfn; + + /* Remove lower zones from higher ones. */ + w = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + if (z[i]) + z[i] -= w; + w += z[i]; + } + + /* Compute holes */ + w = start_pfn; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long s = w; + w += z[i]; + h[i] = e820_hole_size(s, w); + } + + /* Add the space pace needed for mem_map to the holes too. */ + for (i = 0; i < MAX_NR_ZONES; i++) + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; + + /* The 16MB DMA zone has the kernel and other misc mappings. + Account them too */ + if (h[ZONE_DMA]) { + h[ZONE_DMA] += dma_reserve; + if (h[ZONE_DMA] >= z[ZONE_DMA]) { + printk(KERN_WARNING + "Kernel too large and filling up ZONE_DMA?\n"); + h[ZONE_DMA] = z[ZONE_DMA]; + } + } } #ifndef CONFIG_NUMA void __init paging_init(void) { - { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long holes[MAX_NR_ZONES]; - unsigned int max_dma; - - memset(zones_size, 0, sizeof(zones_size)); - memset(holes, 0, sizeof(holes)); - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (end_pfn < max_dma) { - zones_size[ZONE_DMA] = end_pfn; - holes[ZONE_DMA] = e820_hole_size(0, end_pfn); - } else { - zones_size[ZONE_DMA] = max_dma; - holes[ZONE_DMA] = e820_hole_size(0, max_dma); - zones_size[ZONE_NORMAL] = end_pfn - max_dma; - holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); - } - free_area_init_node(0, NODE_DATA(0), zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); - } - return; + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; + size_zones(zones, holes, 0, end_pfn); + free_area_init_node(0, NODE_DATA(0), zones, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); } #endif @@ -438,19 +472,16 @@ void __init mem_init(void) datasize >> 10, initsize >> 10); +#ifdef CONFIG_SMP /* - * Subtle. SMP is doing its boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. + * Sync boot_level4_pgt mappings with the init_level4_pgt + * except for the low identity mappings which are already zapped + * in init_level4_pgt. This sync-up is essential for AP's bringup */ -#ifndef CONFIG_SMP - zap_low_mappings(); + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); #endif } -extern char __initdata_begin[], __initdata_end[]; - void free_initmem(void) { unsigned long addr; @@ -464,7 +495,7 @@ void free_initmem(void) totalram_pages++; } memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); + printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); } #ifdef CONFIG_BLK_DEV_INITRD @@ -491,6 +522,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) #else reserve_bootmem(phys, len); #endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) + dma_reserve += len / PAGE_SIZE; } int kern_addr_valid(unsigned long addr) @@ -532,10 +565,6 @@ extern int exception_trace, page_fault_trace; static ctl_table debug_table2[] = { { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, proc_dointvec }, -#ifdef CONFIG_CHECKING - { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, - proc_dointvec }, -#endif { 0, } }; diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index ecf7acb5db9..ae207064201 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -247,9 +247,15 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) return __ioremap(phys_addr, size, _PAGE_PCD); } +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ void iounmap(volatile void __iomem *addr) { - struct vm_struct *p; + struct vm_struct *p, *o; if (addr <= high_memory) return; @@ -257,12 +263,31 @@ void iounmap(volatile void __iomem *addr) addr < phys_to_virt(ISA_END_ADDRESS)) return; - write_lock(&vmlist_lock); - p = __remove_vm_area((void *)((unsigned long)addr & PAGE_MASK)); - if (!p) + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { printk("iounmap: bad address %p\n", addr); - else if (p->flags >> 20) + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + if (p->flags >> 20) ioremap_change_attr(p->phys_addr, p->size, 0); - write_unlock(&vmlist_lock); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); kfree(p); } diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 65417b040c1..a5663e0bb01 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit >>= 16; limit <<= 24; limit |= (1<<24)-1; + limit++; if (limit > end_pfn << PAGE_SHIFT) limit = end_pfn << PAGE_SHIFT; diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 21480382100..15b67d2760c 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -38,38 +38,59 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; -int __init compute_hash_shift(struct node *nodes, int numnodes) + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init populate_memnodemap( + const struct node *nodes, int numnodes, int shift) { int i; - int shift = 20; - unsigned long addr,maxend=0; - - for (i = 0; i < numnodes; i++) - if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend)) - maxend = nodes[i].end; + int res = -1; + unsigned long addr, end; - while ((1UL << shift) < (maxend / NODEMAPSIZE)) - shift++; - - printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n", - shift,maxend); - memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); + if (shift >= 64) + return -1; + memset(memnodemap, 0xff, sizeof(memnodemap)); for (i = 0; i < numnodes; i++) { - if (nodes[i].start == nodes[i].end) + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) continue; - for (addr = nodes[i].start; - addr < nodes[i].end; - addr += (1UL << shift)) { - if (memnodemap[addr >> shift] != 0xff) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d adder=%lu\n", - shift,addr); + if ((end >> shift) >= NODEMAPSIZE) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) return -1; - } memnodemap[addr >> shift] = i; - } + addr += (1UL << shift); + } while (addr < end); + res = 1; } + return res; +} + +int __init compute_hash_shift(struct node *nodes, int numnodes) +{ + int shift = 20; + + while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + shift++; + + printk(KERN_DEBUG "Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } return shift; } @@ -94,7 +115,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -132,29 +152,14 @@ void __init setup_node_zones(int nodeid) unsigned long start_pfn, end_pfn; unsigned long zones[MAX_NR_ZONES]; unsigned long holes[MAX_NR_ZONES]; - unsigned long dma_end_pfn; - memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); - memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); + start_pfn = node_start_pfn(nodeid); + end_pfn = node_end_pfn(nodeid); - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); + Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", + nodeid, start_pfn, end_pfn); - Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); - - /* All nodes > 0 have a zero length zone DMA */ - dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; - if (start_pfn < dma_end_pfn) { - zones[ZONE_DMA] = dma_end_pfn - start_pfn; - holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn); - zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; - holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn); - - } else { - zones[ZONE_NORMAL] = end_pfn - start_pfn; - holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn); - } - + size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, start_pfn, holes); } @@ -171,7 +176,7 @@ void __init numa_init_array(void) for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] != NUMA_NO_NODE) continue; - cpu_to_node[i] = rr; + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); @@ -205,8 +210,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) if (i == numa_fake-1) sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; nodes[i].end = nodes[i].start + sz; - if (i != numa_fake-1) - nodes[i].end--; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, @@ -257,7 +260,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) - cpu_to_node[i] = 0; + numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } @@ -267,6 +270,12 @@ __cpuinit void numa_add_cpu(int cpu) set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); } +void __cpuinit numa_set_node(int cpu, int node) +{ + cpu_pda[cpu].nodenumber = node; + cpu_to_node[cpu] = node; +} + unsigned long __init numa_free_all_bootmem(void) { int i; @@ -277,9 +286,26 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } +#ifdef CONFIG_SPARSEMEM +static void __init arch_sparse_init(void) +{ + int i; + + for_each_online_node(i) + memory_present(i, node_start_pfn(i), node_end_pfn(i)); + + sparse_init(); +} +#else +#define arch_sparse_init() do {} while (0) +#endif + void __init paging_init(void) { int i; + + arch_sparse_init(); + for_each_online_node(i) { setup_node_zones(i); } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 4b2e844c15a..33340bd1e32 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -71,8 +71,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) nd->start = nd->end; } if (nd->end > end) { - if (!(end & 0xfff)) - end--; nd->end = end; if (nd->start > nd->end) nd->start = nd->end; @@ -166,8 +164,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } - if (!(nd->end & 0xfff)) - nd->end--; printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); } @@ -203,7 +199,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (cpu_to_node[i] == NUMA_NO_NODE) continue; if (!node_isset(cpu_to_node[i], nodes_parsed)) - cpu_to_node[i] = NUMA_NO_NODE; + numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); return 0; |