| author | Ingo Molnar <mingo@elte.hu> | 2008-11-26 08:22:50 +0100 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-11-26 08:22:50 +0100 |
| commit | 7fbb8759eff9a348efa5f352ffaa51c364837c4b (patch) | |
| tree | d40cd3f47b9f667ba94d9613270132080dcb6a1a /arch/x86/mm/init_64.c | |
| parent | 6003ab0bad4cc56f3c4fadf62a0d23a967b9c53b (diff) | |
| parent | 13d428afc007fcfcd6deeb215618f54cf9c0cae6 (diff) | |
Merge commit 'v2.6.28-rc6' into core/debug
Diffstat (limited to 'arch/x86/mm/init_64.c')
| -rw-r--r-- | arch/x86/mm/init_64.c | 253 |
1 file changed, 194 insertions, 59 deletions
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060..9db01db6e3c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
 #include <linux/nmi.h>
 
 #include <asm/processor.h>
+#include <asm/bios_ebda.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
 
 int after_bootmem;
 
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+static int do_not_nx __cpuinitdata;
+
+/*
+ * noexec=on|off
+ * Control non-executable mappings for 64-bit processes.
+ *
+ * on	Enable (default)
+ * off	Disable
+ */
+static int __init nonx_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		do_not_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		do_not_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", nonx_setup);
+
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || do_not_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	}
 
 	pte = pte_offset_kernel(pmd, vaddr);
-	if (!pte_none(*pte) && pte_val(new_pte) &&
-	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
-		pte_ERROR(*pte);
 	set_pte(pte, new_pte);
 
 	/*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
 	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	if (pfn >= table_top)
 		panic("alloc_low_page: ran out of memory");
 
-	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
+	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	memset(adr, 0, PAGE_SIZE);
 	*phys  = pfn * PAGE_SIZE;
 	return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
 }
 
 static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
 {
 	unsigned pages = 0;
 	unsigned long last_map_addr = end;
@@ -289,36 +344,45 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
 			break;
 		}
 
-		if (pte_val(*pte))
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume someone who pre-setup
+		 * these mappings are more intelligent.
+		 */
+		if (pte_val(*pte)) {
+			pages++;
 			continue;
+		}
 
 		if (0)
 			printk("   pte=%p addr=%lx pte=%016lx\n",
 			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
-		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
-		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 		pages++;
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 	}
+
 	update_page_count(PG_LEVEL_4K, pages);
 
 	return last_map_addr;
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
+		pgprot_t prot)
 {
 	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 
-	return phys_pte_init(pte, address, end);
+	return phys_pte_init(pte, address, end, prot);
 }
 
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
-	      unsigned long page_size_mask)
+	      unsigned long page_size_mask, pgprot_t prot)
 {
 	unsigned long pages = 0;
 	unsigned long last_map_addr = end;
-	unsigned long start = address;
 
 	int i = pmd_index(address);
@@ -326,6 +390,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		unsigned long pte_phys;
 		pmd_t *pmd = pmd_page + pmd_index(address);
 		pte_t *pte;
+		pgprot_t new_prot = prot;
 
 		if (address >= end) {
 			if (!after_bootmem) {
@@ -339,27 +404,42 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
 				last_map_addr = phys_pte_update(pmd, address,
-								end);
+								end, prot);
 				spin_unlock(&init_mm.page_table_lock);
+				continue;
 			}
-			/* Count entries we're using from level2_ident_pgt */
-			if (start == 0)
+			/*
+			 * If we are ok with PG_LEVEL_2M mapping, then we will
+			 * use the existing mapping,
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits except for
+			 * large page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M)) {
 				pages++;
-			continue;
+				continue;
+			}
+			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
 			spin_lock(&init_mm.page_table_lock);
 			set_pte((pte_t *)pmd,
-				pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+				pfn_pte(address >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
 			spin_unlock(&init_mm.page_table_lock);
 			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 			continue;
 		}
 
 		pte = alloc_low_page(&pte_phys);
-		last_map_addr = phys_pte_init(pte, address, end);
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
 		unmap_low_page(pte);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -372,12 +452,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 
 static unsigned long __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-		unsigned long page_size_mask)
+		unsigned long page_size_mask, pgprot_t prot)
 {
 	pmd_t *pmd = pmd_offset(pud, 0);
 	unsigned long last_map_addr;
 
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
+	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 	__flush_tlb_all();
 	return last_map_addr;
 }
@@ -394,6 +474,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
 
 		if (addr >= end)
 			break;
@@ -405,10 +486,28 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		if (pud_val(*pud)) {
-			if (!pud_large(*pud))
+			if (!pud_large(*pud)) {
 				last_map_addr = phys_pmd_update(pud, addr, end,
-							 page_size_mask);
-			continue;
+							 page_size_mask, prot);
+				continue;
+			}
+			/*
+			 * If we are ok with PG_LEVEL_1G mapping, then we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection bits except for large
+			 * page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G)) {
+				pages++;
+				continue;
+			}
+			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +521,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
-		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
 		unmap_low_page(pmd);
 
 		spin_lock(&init_mm.page_table_lock);
@@ -430,6 +530,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 		spin_unlock(&init_mm.page_table_lock);
 	}
 	__flush_tlb_all();
+
 	update_page_count(PG_LEVEL_1G, pages);
 
 	return last_map_addr;
@@ -446,27 +547,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end)
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
 {
 	unsigned long puds, pmds, ptes, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
-	if (direct_gbpages) {
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+	if (use_gbpages) {
 		unsigned long extra;
 		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
 		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
 	} else
 		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	if (cpu_has_pse) {
+	if (use_pse) {
 		unsigned long extra;
 		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
 		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	} else
 		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +630,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
 	}
+	__flush_tlb_all();
 
 	return last_map_addr;
 }
@@ -568,11 +671,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	unsigned long last_map_addr = 0;
 	unsigned long page_size_mask = 0;
 	unsigned long start_pfn, end_pfn;
+	unsigned long pos;
 
 	struct map_range mr[NR_RANGE_MR];
 	int nr_range, i;
+	int use_pse, use_gbpages;
 
-	printk(KERN_INFO "init_memory_mapping\n");
+	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
 
 	/*
 	 * Find space for the kernel direct mapping tables.
@@ -584,9 +689,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		init_gbpages();
 
-	if (direct_gbpages)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (cpu_has_pse)
+	if (use_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 
 	memset(mr, 0, sizeof(mr));
@@ -594,35 +711,50 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	/* head if not big page alignment ?*/
 	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+	pos = start_pfn << PAGE_SHIFT;
+	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
 			<< (PMD_SHIFT - PAGE_SHIFT);
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pos = end_pfn << PAGE_SHIFT;
+	}
 
 	/* big page (2M) range*/
-	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
 			<< (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
 			<< (PUD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
-		end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-			page_size_mask & (1<<PG_LEVEL_2M));
+	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
 
 	/* big page (1G) range */
-	start_pfn = end_pfn;
-	end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			<< (PUD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
 				page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pos = end_pfn << PAGE_SHIFT;
+	}
 
 	/* tail is not big page (1G) alignment */
-	start_pfn = end_pfn;
-	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-			page_size_mask & (1<<PG_LEVEL_2M));
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
 
 	/* tail is not big page (2M) alignment */
-	start_pfn = end_pfn;
+	start_pfn = pos>>PAGE_SHIFT;
 	end_pfn = end>>PAGE_SHIFT;
 	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
@@ -636,7 +768,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		old_start = mr[i].start;
 		memmove(&mr[i], &mr[i+1],
 			(nr_range - 1 - i) * sizeof (struct map_range));
-		mr[i].start = old_start;
+		mr[i--].start = old_start;
 		nr_range--;
 	}
@@ -647,7 +779,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 			(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
 	if (!after_bootmem)
-		find_early_table_space(end);
+		find_early_table_space(end, use_pse, use_gbpages);
 
 	for (i = 0; i < nr_range; i++)
 		last_map_addr = kernel_physical_mapping_init(
@@ -721,12 +853,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	last_mapped_pfn = init_memory_mapping(start, start + size-1);
+	last_mapped_pfn = init_memory_mapping(start, start + size);
 	if (last_mapped_pfn > max_pfn_mapped)
 		max_pfn_mapped = last_mapped_pfn;
 
 	ret = __add_pages(zone, start_pfn, nr_pages);
-	WARN_ON(1);
+	WARN_ON_ONCE(ret);
 
 	return ret;
 }
@@ -768,6 +900,9 @@ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 void __init mem_init(void)
 {
 	long codesize, reservedpages, datasize, initsize;
+	unsigned long absent_pages;
+
+	start_periodic_check_for_corruption();
 
 	pci_iommu_alloc();
@@ -781,8 +916,9 @@ void __init mem_init(void)
 #else
 	totalram_pages = free_all_bootmem();
 #endif
-	reservedpages = max_pfn - totalram_pages -
-					absent_pages_in_range(0, max_pfn);
+
+	absent_pages = absent_pages_in_range(0, max_pfn);
+	reservedpages = max_pfn - totalram_pages - absent_pages;
 	after_bootmem = 1;
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@ -799,15 +935,14 @@ void __init mem_init(void)
 				VSYSCALL_END - VSYSCALL_START);
 
 	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
-				"%ldk reserved, %ldk data, %ldk init)\n",
+		"%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 		max_pfn << (PAGE_SHIFT-10),
 		codesize >> 10,
+		absent_pages << (PAGE_SHIFT-10),
 		reservedpages << (PAGE_SHIFT-10),
 		datasize >> 10,
 		initsize >> 10);
-
-	cpa_init();
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
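The head/2M/1G/tail range splitting that this merge brings into init_memory_mapping() is easier to follow outside the kernel. Below is a minimal standalone C sketch of the same pfn-alignment arithmetic; it is illustrative only, assumes LP64 (64-bit unsigned long), the constants mirror x86-64's 4k/2M/1G page sizes, and save_mr() is a hypothetical stand-in that just prints each range instead of recording it in mr[].

```c
/* range_split.c - userspace sketch of init_memory_mapping()'s range
 * splitting (illustrative, not kernel code).
 * Build: cc -o range_split range_split.c */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21			/* 2 MB large pages */
#define PUD_SHIFT	30			/* 1 GB large pages */
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SIZE	(1UL << PUD_SHIFT)

/* stand-in for the kernel's save_mr(): just print the range */
static void save_mr(unsigned long start_pfn, unsigned long end_pfn,
		    const char *page_size)
{
	printf("  %010lx - %010lx : %s pages\n",
	       start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT, page_size);
}

int main(void)
{
	/* sample range: starts 4k-aligned but not 2M-aligned */
	unsigned long start = 0x00201000, end = 0x80000000;
	unsigned long pos = start, start_pfn, end_pfn;

	/* head, if start is not 2M-aligned: 4k pages up to a 2M boundary */
	start_pfn = pos >> PAGE_SHIFT;
	end_pfn = ((pos + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		save_mr(start_pfn, end_pfn, "4k");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* 2M pages up to the first 1G boundary (clamped to end) */
	start_pfn = ((pos + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = ((pos + PUD_SIZE - 1) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (end_pfn > ((end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT)))
		end_pfn = ((end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT));
	if (start_pfn < end_pfn) {
		save_mr(start_pfn, end_pfn, "2M");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* 1G pages for the middle */
	start_pfn = ((pos + PUD_SIZE - 1) >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		save_mr(start_pfn, end_pfn, "1G");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* 2M tail below the last 1G boundary */
	start_pfn = ((pos + PMD_SIZE - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (start_pfn < end_pfn) {
		save_mr(start_pfn, end_pfn, "2M");
		pos = end_pfn << PAGE_SHIFT;
	}

	/* 4k tail */
	start_pfn = pos >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;
	if (start_pfn < end_pfn)
		save_mr(start_pfn, end_pfn, "4k");

	return 0;
}
```

For the sample range above (start 4k- but not 2M-aligned, end 1G-aligned) the sketch prints a small 4k head, a 2M run up to the first 1G boundary, then 1G pages, mirroring how the patched code only emits a range when start_pfn < end_pfn and advances pos past each emitted range.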
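Likewise, find_early_table_space() now takes use_pse/use_gbpages explicitly because the amount of early page-table memory depends on which page sizes the mapping will actually use, which is exactly what the CONFIG_DEBUG_PAGEALLOC branch changes. The following is a hedged userspace sketch of the same worst-case sizing arithmetic (hypothetical code, x86-64 constants, 8-byte table entries; not the kernel's implementation):

```c
/* table_space.c - sketch of find_early_table_space()'s sizing logic
 * (illustrative only). Build: cc -o table_space table_space.c */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PUD_SHIFT	30
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PUD_SIZE	(1UL << PUD_SHIFT)
#define ENTRY_SIZE	8	/* sizeof(pud_t/pmd_t/pte_t) on x86-64 */

static unsigned long roundup(unsigned long x, unsigned long align)
{
	return (x + align - 1) / align * align;
}

/* how many bytes of page tables does mapping [0, end) need? */
static unsigned long table_space(unsigned long end, int use_pse,
				 int use_gbpages)
{
	unsigned long puds, pmds, ptes, tables, extra;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = roundup(puds * ENTRY_SIZE, PAGE_SIZE);

	if (use_gbpages) {
		/* only the 1G-unaligned tail needs PMD entries */
		extra = end - ((end >> PUD_SHIFT) << PUD_SHIFT);
		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
	} else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += roundup(pmds * ENTRY_SIZE, PAGE_SIZE);

	if (use_pse) {
		/* only the 2M-unaligned tail needs PTE entries */
		extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	} else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += roundup(ptes * ENTRY_SIZE, PAGE_SIZE);

	return tables;
}

int main(void)
{
	unsigned long end = 4UL << 30;	/* map 4 GB */

	printf("4k only : %lu KB\n", table_space(end, 0, 0) >> 10);
	printf("2M pages: %lu KB\n", table_space(end, 1, 0) >> 10);
	printf("1G + 2M : %lu KB\n", table_space(end, 1, 1) >> 10);
	return 0;
}
```

Mapping 4 GB needs roughly 8 MB of tables with 4k pages, about 20 KB with 2M pages, and about 4 KB with 1G pages, which is why forcing small pages under CONFIG_DEBUG_PAGEALLOC must be reflected in the early sizing rather than keying it off cpu_has_pse/direct_gbpages directly.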
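The comment repeated in phys_pmd_init() and phys_pud_init() about Intel's TLB Application note (317080) concerns splitting an existing large mapping: the new small-page entries must keep the old attributes and differ only in page size. Here is a simplified illustration (hypothetical userspace code with a reduced x86 PTE bit layout, not the kernel's pte helpers) of what new_prot = pte_pgprot(pte_clrhuge(*pmd)) preserves when a 2M entry is split into 4k entries:

```c
/* split_2m.c - sketch of an attribute-preserving 2M -> 4k split
 * (illustrative only). Build: cc -o split_2m split_2m.c */
#include <stdio.h>
#include <stdint.h>

#define _PAGE_PRESENT	(1ULL << 0)
#define _PAGE_RW	(1ULL << 1)
#define _PAGE_PSE	(1ULL << 7)	/* large-page bit */
#define _PAGE_NX	(1ULL << 63)
#define PAGE_SHIFT	12
#define PTRS_PER_PTE	512
#define ADDR_MASK	0x000ffffffffff000ULL	/* frame bits 12..51 */

int main(void)
{
	/* a 2M kernel mapping at 0x40000000: present, writable, NX */
	uint64_t pmd = 0x40000000ULL | _PAGE_PRESENT | _PAGE_RW |
		       _PAGE_PSE | _PAGE_NX;

	/* keep every protection bit except PSE, drop the frame bits;
	 * this mimics pte_pgprot(pte_clrhuge(...)) */
	uint64_t new_prot = (pmd & ~ADDR_MASK) & ~_PAGE_PSE;
	uint64_t frame = pmd & ADDR_MASK;

	/* the 512 replacement 4k entries cover the same 2M frame with
	 * the same attributes, so old and new translations differ only
	 * in page size */
	for (int i = 0; i < PTRS_PER_PTE; i++) {
		uint64_t pte = (frame + ((uint64_t)i << PAGE_SHIFT)) | new_prot;
		if (i < 2 || i == PTRS_PER_PTE - 1)
			printf("pte[%3d] = %016llx\n", i,
			       (unsigned long long)pte);
	}
	return 0;
}
```

The same reasoning applies one level up in phys_pud_init(), where a 1G mapping's protection bits are carried down to the 2M entries that replace it.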