diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/fault.c | 81 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 44 | ||||
-rw-r--r-- | arch/x86_64/mm/k8topology.c | 2 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 39 | ||||
-rw-r--r-- | arch/x86_64/mm/pageattr.c | 63 | ||||
-rw-r--r-- | arch/x86_64/mm/srat.c | 8 |
6 files changed, 149 insertions, 88 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 2e7c3c8ffe0..55250593d8c 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -264,6 +264,8 @@ static int vmalloc_fault(unsigned long address) return -1; if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); /* Below here mismatches are bugs because these lower tables are shared */ @@ -312,21 +314,13 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long flags; siginfo_t info; + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + /* get the address */ __asm__("movq %%cr2,%0":"=r" (address)); - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - - tsk = current; - mm = tsk->mm; info.si_code = SEGV_MAPERR; @@ -351,10 +345,12 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, */ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) < 0) - goto bad_area_nosemaphore; - return; + if (vmalloc_fault(address) >= 0) + return; } + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. @@ -362,6 +358,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + + if (likely(regs->eflags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(page_fault_trace)) + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); @@ -571,9 +578,51 @@ do_sigbus: return; } +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +void vmalloc_sync_all(void) +{ + /* Note that races in the updates of insync and start aren't + problematic: + insync can only get set bits added, and updates to start are only + improving performance (without affecting correctness if undone). */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + for (page = pgd_list; page; + page = (struct page *)page->index) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +} + static int __init enable_pagefaulttrace(char *str) { page_fault_trace = 1; - return 0; + return 1; } __setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa95..e5f7f1c3446 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -72,7 +72,7 @@ void show_mem(void) show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pfn_to_page(pgdat->node_start_pfn + i); total++; @@ -94,7 +94,7 @@ void show_mem(void) int after_bootmem; -static void *spp_getpage(void) +static __init void *spp_getpage(void) { void *ptr; if (after_bootmem) @@ -108,7 +108,7 @@ static void *spp_getpage(void) return ptr; } -static void set_pte_phys(unsigned long vaddr, +static __init void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -157,7 +157,8 @@ static void set_pte_phys(unsigned long vaddr, } /* NOTE: this is meant to be run only at boot */ -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -225,6 +226,33 @@ static __meminit void unmap_low_page(int i) ti->allocated = 0; } +/* Must run before zap_low_mappings */ +__init void *early_ioremap(unsigned long addr, unsigned long size) +{ + unsigned long map = round_down(addr, LARGE_PAGE_SIZE); + + /* actually usually some more */ + if (size >= LARGE_PAGE_SIZE) { + printk("SMBIOS area too long %lu\n", size); + return NULL; + } + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); + map += LARGE_PAGE_SIZE; + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); +} + +/* To avoid virtual aliases later */ +__init void early_iounmap(void *addr, unsigned long size) +{ + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) + printk("early_iounmap: bad address %p\n", addr); + set_pmd(temp_mappings[0].pmd, __pmd(0)); + set_pmd(temp_mappings[1].pmd, __pmd(0)); + __flush_tlb(); +} + static void __meminit phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) { @@ -344,7 +372,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) pud_t *pud; if (after_bootmem) - pud = pud_offset_k(pgd, __PAGE_OFFSET); + pud = pud_offset_k(pgd, start & PGDIR_MASK); else pud = alloc_low_page(&map, &pud_phys); @@ -486,7 +514,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +620,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +660,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index dd60e71fdba..7c45c2d2b8b 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -43,7 +43,7 @@ static __init int find_northbridge(void) int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; - struct node nodes[8]; + struct bootnode nodes[8]; int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 22e51beee8d..4be82d6e2b4 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -25,8 +25,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; bootmem_data_t plat_node_bdata[MAX_NUMNODES]; -int memnode_shift; -u8 memnodemap[NODEMAPSIZE]; +struct memnode memnode; unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE @@ -47,7 +46,7 @@ int numa_off __initdata; * -1 if node overlap or lost ram (shift too big) */ static int __init -populate_memnodemap(const struct node *nodes, int numnodes, int shift) +populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) { int i; int res = -1; @@ -74,7 +73,7 @@ populate_memnodemap(const struct node *nodes, int numnodes, int shift) return res; } -int __init compute_hash_shift(struct node *nodes, int numnodes) +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) { int shift = 20; @@ -149,7 +148,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en /* Initialize final allocator for a zone */ void __init setup_node_zones(int nodeid) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn, end_pfn, memmapsize, limit; unsigned long zones[MAX_NR_ZONES]; unsigned long holes[MAX_NR_ZONES]; @@ -159,6 +158,16 @@ void __init setup_node_zones(int nodeid) Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); + /* Try to allocate mem_map at end to not fill up precious <4GB + memory. */ + memmapsize = sizeof(struct page) * (end_pfn-start_pfn); + limit = end_pfn << PAGE_SHIFT; + NODE_DATA(nodeid)->node_mem_map = + __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, + memmapsize, SMP_CACHE_BYTES, + round_down(limit - memmapsize, PAGE_SIZE), + limit); + size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, start_pfn, holes); @@ -191,7 +200,7 @@ int numa_fake __initdata = 0; static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { int i; - struct node nodes[MAX_NUMNODES]; + struct bootnode nodes[MAX_NUMNODES]; unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; /* Kludge needed for the hash function */ @@ -357,8 +366,7 @@ void __init init_cpu_to_node(void) EXPORT_SYMBOL(cpu_to_node); EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode_shift); -EXPORT_SYMBOL(memnodemap); +EXPORT_SYMBOL(memnode); EXPORT_SYMBOL(node_data); #ifdef CONFIG_DISCONTIGMEM @@ -369,21 +377,6 @@ EXPORT_SYMBOL(node_data); * Should do that. */ -/* Requires pfn_valid(pfn) to be true */ -struct page *pfn_to_page(unsigned long pfn) -{ - int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); - return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; -} -EXPORT_SYMBOL(pfn_to_page); - -unsigned long page_to_pfn(struct page *page) -{ - return (long)(((page) - page_zone(page)->zone_mem_map) + - page_zone(page)->zone_start_pfn); -} -EXPORT_SYMBOL(page_to_pfn); - int pfn_valid(unsigned long pfn) { unsigned nid; diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 35f1f1aab06..531ad21447b 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -45,6 +45,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, pte_t *pbase; if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -77,26 +84,12 @@ static inline void flush_map(unsigned long address) on_each_cpu(flush_kernel_map, (void *)address, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; - unsigned long address; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ -static inline void save_page(unsigned long address, struct page *fpage) +static inline void save_page(struct page *fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(address); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df->address = address; - df_list = df; - } + fpage->lru.next = (struct list_head *)deferred_pages; + deferred_pages = fpage; } /* @@ -138,8 +131,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte, pfn_pte(pfn, prot)); } else { /* - * split_large_page will take the reference for this change_page_attr - * on the split page. + * split_large_page will take the reference for this + * change_page_attr on the split page. */ struct page *split; @@ -151,23 +144,20 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte,mk_pte(split, ref_prot2)); kpte_page = split; } - get_page(kpte_page); + page_private(kpte_page)++; } else if ((kpte_flags & _PAGE_PSE) == 0) { set_pte(kpte, pfn_pte(pfn, ref_prot)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - switch (page_count(kpte_page)) { - case 1: - save_page(address, kpte_page); + if (page_private(kpte_page) == 0) { + save_page(kpte_page); revert_page(address, ref_prot); - break; - case 0: - BUG(); /* memleak and failed 2M page regeneration */ } return 0; } @@ -220,17 +210,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + struct page *dpage; down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); + dpage = xchg(&deferred_pages, NULL); up_read(&init_mm.mmap_sem); - flush_map((df && !df->next) ? df->address : 0); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); + + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); + while (dpage) { + struct page *tmp = dpage; + dpage = (struct page *)dpage->lru.next; + ClearPagePrivate(tmp); + __free_page(tmp); } } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 482c2576736..2eb879590dc 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -23,7 +23,7 @@ static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; -static struct node nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes[MAX_NUMNODES] __initdata; static u8 pxm2node[256] = { [0 ... 255] = 0xff }; /* Too small nodes confuse the VM badly. Usually they result @@ -57,7 +57,7 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end) { int i; for_each_node_mask(i, nodes_parsed) { - struct node *nd = &nodes[i]; + struct bootnode *nd = &nodes[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) @@ -70,7 +70,7 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end) static __init void cutoff_node(int i, unsigned long start, unsigned long end) { - struct node *nd = &nodes[i]; + struct bootnode *nd = &nodes[i]; if (nd->start < start) { nd->start = start; if (nd->end < nd->start) @@ -159,7 +159,7 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) void __init acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) { - struct node *nd; + struct bootnode *nd; unsigned long start, end; int node, pxm; int i; |