From fa62aafea9e415cd1efd8c4054106112fe809f19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:38 -0800 Subject: x86, mm: Add global page_size_mask and probe one time only Now we pass around use_gbpages and use_pse for calculating page table size, Later we will need to call init_memory_mapping for every ram range one by one, that mean those calculation will be done several times. Those information are the same for all ram range and could be stored in page_size_mask and could be probed it one time only. Move that probing code out of init_memory_mapping into separated function probe_page_size_mask(), and call it before all init_memory_mapping. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-2-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f7..98ac76dc4ea 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,6 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; +void probe_page_size_mask(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) -- cgit v1.2.3-70-g09d2 From 22ddfcaa0dbae992332381d41b8a1fbc72269a13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:41 -0800 Subject: x86, mm: Move init_memory_mapping calling out of setup.c Now init_memory_mapping is called two times, later will be called for every ram ranges. Could put all related init_mem calling together and out of setup.c. Actually, it reverts commit 1bbbbe7 x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping. will address that later with complete solution include handling hole under 4g. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-5-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 1 - arch/x86/include/asm/pgtable.h | 2 +- arch/x86/kernel/setup.c | 27 +-------------------------- arch/x86/mm/init.c | 19 ++++++++++++++++++- 4 files changed, 20 insertions(+), 29 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d0..4f13998be59 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,7 +12,6 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); - extern unsigned long __initdata pgt_buf_start; extern unsigned long __meminitdata pgt_buf_end; extern unsigned long __meminitdata pgt_buf_top; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 98ac76dc4ea..dd1a88832d2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,7 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; -void probe_page_size_mask(void); +void init_mem_mapping(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 01fb5f9baf9..23b079fb93f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,34 +913,9 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); - probe_page_size_mask(); - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { - - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; - - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } - - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 701abbc2473..9e17f9e18a2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -37,7 +37,7 @@ struct map_range { static int page_size_mask; -void probe_page_size_mask(void) +static void __init probe_page_size_mask(void) { #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* @@ -315,6 +315,23 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return ret >> PAGE_SHIFT; } +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* max_pfn_mapped is updated here */ + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< Date: Fri, 16 Nov 2012 19:38:48 -0800 Subject: x86, mm: Fixup code testing if a pfn is direct mapped Update code that previously assumed pfns [ 0 - max_low_pfn_mapped ) and [ 4GB - max_pfn_mapped ) were always direct mapped, to now look up pfn_mapped ranges instead. -v2: change applying sequence to keep git bisecting working. so add dummy pfn_range_is_mapped(). - Yinghai Lu Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1353123563-3103-12-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 8 ++++++++ arch/x86/kernel/cpu/amd.c | 8 +++----- arch/x86/platform/efi/efi.c | 7 +++---- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index e21fdd10479..45aae6e5f64 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,14 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } +static inline bool pfn_range_is_mapped(unsigned long start_pfn, + unsigned long end_pfn) +{ + return end_pfn <= max_low_pfn_mapped || + (end_pfn > (1UL << (32 - PAGE_SHIFT)) && + end_pfn <= max_pfn_mapped); +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d1..9619ba6528c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -676,12 +676,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad4439145f8..36e53f0a9ce 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, end_pfn; + u64 end, systab, start_pfn, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; + start_pfn = PFN_DOWN(md->phys_addr); end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) { + if (pfn_range_is_mapped(start_pfn, end_pfn)) { va = __va(md->phys_addr); if (!(md->attribute & EFI_MEMORY_WB)) -- cgit v1.2.3-70-g09d2 From 66520ebc2df3fe52eb4792f8101fac573b766baf Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Fri, 16 Nov 2012 19:38:52 -0800 Subject: x86, mm: Only direct map addresses that are marked as E820_RAM Currently direct mappings are created for [ 0 to max_low_pfn< Link: http://lkml.kernel.org/r/1353123563-3103-16-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 8 +-- arch/x86/kernel/setup.c | 8 ++- arch/x86/mm/init.c | 120 ++++++++++++++++++++++++++++++++++---- arch/x86/mm/init_64.c | 6 +- 4 files changed, 117 insertions(+), 25 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 45aae6e5f64..54c97879195 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } -static inline bool pfn_range_is_mapped(unsigned long start_pfn, - unsigned long end_pfn) -{ - return end_pfn <= max_low_pfn_mapped || - (end_pfn > (1UL << (32 - PAGE_SHIFT)) && - end_pfn <= max_pfn_mapped); -} +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd52f9da17c..68dffeceb19 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,9 +116,11 @@ #include /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7b961d0b138..bb44e9f2cc4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -243,6 +243,38 @@ static unsigned long __init calculate_table_space_size(unsigned long start, unsi return tables; } +static unsigned long __init calculate_all_table_space_size(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long tables; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + tables = calculate_table_space_size(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = start_pfn << PAGE_SHIFT; + u64 end = end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + tables += calculate_table_space_size(start, end); + } + + return tables; +} + static void __init find_early_table_space(unsigned long start, unsigned long good_end, unsigned long tables) @@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long start, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } +static struct range pfn_mapped[E820_X_MAX]; +static int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + return ret >> PAGE_SHIFT; } +/* + * Iterate through E820 memory map and create direct mappings for only E820_RAM + * regions. We cannot simply create direct mappings for all pfns from + * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in + * high addresses that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result in using + * smaller size (i.e. 4K instead of 2M or 1G) page tables. + */ +static void __init init_all_memory_mapping(void) +{ + unsigned long start_pfn, end_pfn; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = (u64)start_pfn << PAGE_SHIFT; + u64 end = (u64)end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + init_memory_mapping(start, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif +} + void __init init_mem_mapping(void) { unsigned long tables, good_end, end; @@ -311,23 +417,15 @@ void __init init_mem_mapping(void) end = max_low_pfn << PAGE_SHIFT; good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_table_space_size(0, end); + tables = calculate_all_table_space_size(); find_early_table_space(0, good_end, tables); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); -- cgit v1.2.3-70-g09d2 From 8d57470d8f859635deffe3919d7d4867b488b85a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:58 -0800 Subject: x86, mm: setup page table in top-down Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first. Then use mapped pages to map more ranges below, and keep looping until all pages get mapped. alloc_low_page will use page from BRK at first, after that buffer is used up, will use memblock to find and reserve pages for page table usage. Introduce min_pfn_mapped to make sure find new pages from mapped ranges, that will be updated when lower pages get mapped. Also add step_size to make sure that don't try to map too big range with limited mapped pages initially, and increase the step_size when we have more mapped pages on hand. We don't need to call pagetable_reserve anymore, reserve work is done in alloc_low_page() directly. At last we can get rid of calculation and find early pgt related code. -v2: update to after fix_xen change, also use MACRO for initial pgt_buf size and add comments with it. -v3: skip big reserved range in memblock.reserved near end. -v4: don't need fix_xen change now. -v5: add changelog about moving about reserving pagetable to alloc_low_page. Suggested-by: "H. Peter Anvin" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 1 + arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/mm/init.c | 210 +++++++++++--------------------------- arch/x86/mm/init_32.c | 17 ++- arch/x86/mm/init_64.c | 17 ++- 6 files changed, 94 insertions(+), 155 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195..9f6f3e66e84 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd1a88832d2..6991a3e1bf8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 94f922a73c5..f7634092931 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,6 +124,7 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); @@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c688ea3887f..2393d0099e7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + int after_bootmem; int direct_gbpages @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. - */ -static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - struct map_range mr[NR_RANGE_MR]; - int nr_range; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - nr_range = split_mem_range(mr, nr_range, start, end); - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - return tables; -} - -static unsigned long __init calculate_all_table_space_size(void) -{ - unsigned long start_pfn, end_pfn; - unsigned long tables; - int i; - - /* the ISA range is always mapped regardless of memory holes */ - tables = calculate_table_space_size(0, ISA_END_ADDRESS); - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = start_pfn << PAGE_SHIFT; - u64 end = end_pfn << PAGE_SHIFT; - - if (end <= ISA_END_ADDRESS) - continue; - - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) - continue; - - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif - tables += calculate_table_space_size(start, end); - } - - return tables; -} - -static void __init find_early_table_space(unsigned long start, - unsigned long good_end, - unsigned long tables) -{ - phys_addr_t base; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); -} - static struct range pfn_mapped[E820_X_MAX]; static int nr_pfn_mapped; @@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } /* - * Iterate through E820 memory map and create direct mappings for only E820_RAM - * regions. We cannot simply create direct mappings for all pfns from - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in - * high addresses that cannot be marked as UC by fixed/variable range MTRRs. - * Depending on the alignment of E820 ranges, this may possibly result in using - * smaller size (i.e. 4K instead of 2M or 1G) page tables. + * would have hole in the middle or ends, and only ram parts will be mapped. */ -static void __init init_range_memory_mapping(unsigned long range_start, +static unsigned long __init init_range_memory_mapping( + unsigned long range_start, unsigned long range_end) { unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { @@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start, end = range_end; init_memory_mapping(start, end); + + mapped_ram_size += end - start; } + + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 void __init init_mem_mapping(void) { - unsigned long tables, good_end, end; + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; probe_page_size_mask(); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; - good_end = end; #else end = max_low_pfn << PAGE_SHIFT; - good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_all_table_space_size(); - find_early_table_space(0, good_end, tables); - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); - max_pfn_mapped = 0; /* will get exact value next */ /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - init_range_memory_mapping(ISA_END_ADDRESS, end); + + /* xen has big range in reserved near end of ram, skip it at first */ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, + PAGE_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } #endif - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. - */ - if (pgt_buf_end > pgt_buf_start) { - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_end << PAGE_SHIFT) - 1); - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); - } - - /* stop the wrong using */ - pgt_buf_top = 0; - early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27f7fc69cf8..7bb11064a9e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = __va(pfn * PAGE_SIZE); clear_page(adr); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index fa28e3e2974..eefaea637b3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -316,7 +316,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; if (after_bootmem) { @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); clear_page(adr); -- cgit v1.2.3-70-g09d2 From 9985b4c6fa7d660f685918a58282275e9e35d8e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:02 -0800 Subject: x86, mm: Move min_pfn_mapped back to mm/init.c Also change it to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-26-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/mm/init.c | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f6f3e66e84..54c97879195 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,7 +45,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7634092931..20151941cce 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,7 +124,6 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 848189229b2..6392bf9a394 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,6 +23,8 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +static unsigned long min_pfn_mapped; + __ref void *alloc_low_page(void) { unsigned long pfn; -- cgit v1.2.3-70-g09d2 From 6f80b68e9e515547edbacb0c37491730bf766db5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:03 -0800 Subject: x86, mm, Xen: Remove mapping_pagetable_reserve() Page table area are pre-mapped now after x86, mm: setup page table in top-down x86, mm: Remove early_memremap workaround for page table accessing on 64bit mapping_pagetable_reserve is not used anymore, so remove it. Also remove operation in mask_rw_pte(), as modified allow_low_page always return pages that are already mapped, moreover xen_alloc_pte_init, xen_alloc_pmd_init, etc, will mark the page RO before hooking it into the pagetable automatically. -v2: add changelog about mask_rw_pte() from Stefano. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-27-git-send-email-yinghai@kernel.org Cc: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/include/asm/x86_init.h | 12 ------------ arch/x86/kernel/x86_init.c | 4 ---- arch/x86/mm/init.c | 4 ---- arch/x86/xen/mmu.c | 28 ---------------------------- 5 files changed, 49 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505..79738f20aaf 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -301,7 +301,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519..3b2ce8fc995 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,17 +68,6 @@ struct x86_init_oem { void (*banner)(void); }; -/** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. - */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814..50cf83ecd32 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6392bf9a394..21173fcdb4a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,10 +112,6 @@ static void __init probe_page_size_mask(void) __supported_pte_mask |= _PAGE_GLOBAL; } } -void __init native_pagetable_reserve(u64 start, u64 end) -{ - memblock_reserve(start, end - start); -} #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dcf5f2dd91e..bbb883f58bc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; -- cgit v1.2.3-70-g09d2 From cf47065961b48727b4e47bc3e2e67f4996878437 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:07 -0800 Subject: x86, mm: Move back pgt_buf_* to mm/init.c Also change them to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-31-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 4 ---- arch/x86/mm/init.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 4f13998be59..626ea8d31b6 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,8 +12,4 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bed4888c6f4..3cadf1013ce 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -19,9 +19,9 @@ #include "mm_internal.h" -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; -- cgit v1.2.3-70-g09d2 From c8dcdb9ce463ad4a660099a74a850f4f6fc81c41 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:10 -0800 Subject: x86, mm: Move function declaration into mm_internal.h They are only for mm/init*.c. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-34-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 16 +++------------- arch/x86/mm/mm_internal.h | 7 +++++++ 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 626ea8d31b6..bac770b7cbc 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,15 +1,5 @@ -#ifndef _ASM_X86_INIT_32_H -#define _ASM_X86_INIT_32_H +#ifndef _ASM_X86_INIT_H +#define _ASM_X86_INIT_H -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif -extern void __init zone_sizes_init(void); - -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - -#endif /* _ASM_X86_INIT_32_H */ +#endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 7e3b88ee078..dc79ac12177 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -7,4 +7,11 @@ static inline void *alloc_low_page(void) return alloc_low_pages(1); } +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + #endif /* __X86_MM_INTERNAL_H */ -- cgit v1.2.3-70-g09d2 From 94b43c3d86dddf95064fc83e9087448b35f985ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:19 -0800 Subject: x86, mm: kill numa_free_all_bootmem() Now NO_BOOTMEM version free_all_bootmem_node() does not really do free_bootmem at all, and it only call register_page_bootmem_info_node instead. That is confusing, try to kill that free_all_bootmem_node(). Before that, this patch will remove numa_free_all_bootmem(). That function could be replaced with register_page_bootmem_info() and free_all_bootmem(); Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-43-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/numa_64.h | 2 -- arch/x86/mm/init_64.c | 15 +++++++++++---- arch/x86/mm/numa_64.c | 13 ------------- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0c05f7ae46e..fe4d2d410ad 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,6 +1,4 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -extern unsigned long numa_free_all_bootmem(void); - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1d53defc99c..41785305f64 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -629,6 +629,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; @@ -641,11 +651,8 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1..9405ffc9150 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} -- cgit v1.2.3-70-g09d2 From c074eaac2ab264c94520efff7e896b771de885ae Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:20 -0800 Subject: x86, mm: kill numa_64.h Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-44-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/numa.h | 2 -- arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/cpu/amd.c | 1 - arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/setup.c | 3 --- 6 files changed, 12 deletions(-) delete mode 100644 arch/x86/include/asm/numa_64.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 49119fcea2d..52560a2038e 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_X86_32 # include -#else -# include #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h deleted file mode 100644 index fe4d2d410ad..00000000000 --- a/arch/x86/include/asm/numa_64.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_X86_NUMA_64_H -#define _ASM_X86_NUMA_64_H - -#endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589a..4b23aa18518 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include -# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9619ba6528c..913f94f9e8d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include #ifdef CONFIG_X86_64 -# include # include # include #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531..3b547cc4bd0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include -#include #endif #include "cpu.h" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85b62f1c807..6d29d1fcf06 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,9 +108,6 @@ #include #include #include -#ifdef CONFIG_X86_64 -#include -#endif #include #include #include -- cgit v1.2.3-70-g09d2 From 5dcd14ecd41ea2b3ae3295a9b30d98769d52165f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 29 Jan 2013 01:05:24 -0800 Subject: x86, boot: Sanitize boot_params if not zeroed on creation Use the new sentinel field to detect bootloaders which fail to follow protocol and don't initialize fields in struct boot_params that they do not explicitly initialize to zero. Based on an original patch and research by Yinghai Lu. Changed by hpa to be invoked both in the decompression path and in the kernel proper; the latter for the case where a bootloader takes over decompression. Originally-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-26-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 2 ++ arch/x86/boot/compressed/misc.h | 1 + arch/x86/include/asm/bootparam_utils.h | 38 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/head64.c | 2 ++ 5 files changed, 46 insertions(+) create mode 100644 arch/x86/include/asm/bootparam_utils.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 88f7ff6da40..7cb56c6ca35 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,6 +325,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; + sanitize_boot_params(real_mode); + if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0e6dc0ee0ee..674019d8e23 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -18,6 +18,7 @@ #include #include #include +#include #define BOOT_BOOT_H #include "../ctype.h" diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h new file mode 100644 index 00000000000..5b5e9cb774b --- /dev/null +++ b/arch/x86/include/asm/bootparam_utils.h @@ -0,0 +1,38 @@ +#ifndef _ASM_X86_BOOTPARAM_UTILS_H +#define _ASM_X86_BOOTPARAM_UTILS_H + +#include + +/* + * This file is included from multiple environments. Do not + * add completing #includes to make it standalone. + */ + +/* + * Deal with bootloaders which fail to initialize unknown fields in + * boot_params to zero. The list fields in this list are taken from + * analysis of kexec-tools; if other broken bootloaders initialize a + * different set of fields we will need to figure out how to disambiguate. + * + */ +static void sanitize_boot_params(struct boot_params *boot_params) +{ + if (boot_params->sentinel) { + /*fields in boot_params are not valid, clear them */ + memset(&boot_params->olpc_ofw_header, 0, + (char *)&boot_params->alt_mem_k - + (char *)&boot_params->olpc_ofw_header); + memset(&boot_params->kbd_status, 0, + (char *)&boot_params->hdr - + (char *)&boot_params->kbd_status); + memset(&boot_params->_pad7[0], 0, + (char *)&boot_params->edd_mbr_sig_buffer[0] - + (char *)&boot_params->_pad7[0]); + memset(&boot_params->_pad8[0], 0, + (char *)&boot_params->eddbuf[0] - + (char *)&boot_params->_pad8[0]); + memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9)); + } +} + +#endif /* _ASM_X86_BOOTPARAM_UTILS_H */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c18f59d1010..6773c918b8c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,6 +18,7 @@ #include #include #include +#include static void __init i386_default_early_setup(void) { @@ -30,6 +31,8 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { + sanitize_boot_params(&boot_params); + memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 037df57a99a..849fc9e63c2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,7 @@ #include #include #include +#include static void __init zap_identity_mappings(void) { @@ -46,6 +47,7 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; memcpy(&boot_params, real_mode_data, sizeof boot_params); + sanitize_boot_params(&boot_params); if (boot_params.hdr.cmd_line_ptr) { command_line = __va(boot_params.hdr.cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); -- cgit v1.2.3-70-g09d2 From aece27851d44bde62fc0587e06f5e8e27fd96e5f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:48 -0800 Subject: x86, 64bit, mm: Add generic kernel/ident mapping helper It is simple version for kernel_physical_mapping_init. it will work to build one page table that will be used later. Use mapping_info to control 1. alloc_pg_page method 2. if PMD is EXEC, 3. if pgd is with kernel low mapping or ident mapping. Will use to replace some local versions in kexec, hibernation and etc. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-8-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 9 ++++++ arch/x86/mm/init_64.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index bac770b7cbc..223042086f4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,5 +1,14 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +struct x86_mapping_info { + void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void *context; /* context for alloc_pgt_page */ + unsigned long pmd_flag; /* page flag for PMD entry */ + bool kernel_mapping; /* kernel mapping or ident mapping */ +}; + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end); #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d7af907c07f..9fbb85c8d93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -56,6 +56,80 @@ #include "mm_internal.h" +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; -- cgit v1.2.3-70-g09d2 From 4f7b92263ad68cdc72b11808320d9c881bfa857e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:19:51 -0800 Subject: x86, realmode: Separate real_mode reserve and setup After we switch to use #PF handler help to set page table, init_level4_pgt will only have entries set after init_mem_mapping(). We need to move copying init_level4_pgt to trampoline_pgd after that. So split reserve and setup, and move the setup after init_mem_mapping() Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-11-git-send-email-yinghai@kernel.org Cc: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 3 ++- arch/x86/kernel/setup.c | 4 +++- arch/x86/realmode/init.c | 32 ++++++++++++++++++++------------ 3 files changed, 25 insertions(+), 14 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fe1ec5bcd84..9c6b890d5e7 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,6 +58,7 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif -extern void __init setup_real_mode(void); +void reserve_real_mode(void); +void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5552d04b0cc..85a8290801d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -999,12 +999,14 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", (max_pfn_mapped< Date: Thu, 24 Jan 2013 12:19:52 -0800 Subject: x86, 64bit: Use a #PF handler to materialize early mappings on demand Linear mode (CR0.PG = 0) is mutually exclusive with 64-bit mode; all 64-bit code has to use page tables. This makes it awkward before we have first set up properly all-covering page tables to access objects that are outside the static kernel range. So far we have dealt with that simply by mapping a fixed amount of low memory, but that fails in at least two upcoming use cases: 1. We will support load and run kernel, struct boot_params, ramdisk, command line, etc. above the 4 GiB mark. 2. need to access ramdisk early to get microcode to update that as early possible. We could use early_iomap to access them too, but it will make code to messy and hard to be unified with 32 bit. Hence, set up a #PF table and use a fixed number of buffers to set up page tables on demand. If the buffers fill up then we simply flush them and start over. These buffers are all in __initdata, so it does not increase RAM usage at runtime. Thus, with the help of the #PF handler, we can set the final kernel mapping from blank, and switch to init_level4_pgt later. During the switchover in head_64.S, before #PF handler is available, we use three pages to handle kernel crossing 1G, 512G boundaries with sharing page by playing games with page aliasing: the same page is mapped twice in the higher-level tables with appropriate wraparound. The kernel region itself will be properly mapped; other mappings may be spurious. early_make_pgtable is using kernel high mapping address to access pages to set page table. -v4: Add phys_base offset to make kexec happy, and add init_mapping_kernel() - Yinghai -v5: fix compiling with xen, and add back ident level3 and level2 for xen also move back init_level4_pgt from BSS to DATA again. because we have to clear it anyway. - Yinghai -v6: switch to init_level4_pgt in init_mem_mapping. - Yinghai -v7: remove not needed clear_page for init_level4_page it is with fill 512,8,0 already in head_64.S - Yinghai -v8: we need to keep that handler alive until init_mem_mapping and don't let early_trap_init to trash that early #PF handler. So split early_trap_pf_init out and move it down. - Yinghai -v9: switchover only cover kernel space instead of 1G so could avoid touch possible mem holes. - Yinghai -v11: change far jmp back to far return to initial_code, that is needed to fix failure that is reported by Konrad on AMD systems. - Yinghai Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-12-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_64_types.h | 4 + arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/head64.c | 81 ++++++++++-- arch/x86/kernel/head_64.S | 210 +++++++++++++++++++------------- arch/x86/kernel/setup.c | 2 + arch/x86/kernel/traps.c | 9 ++ arch/x86/mm/init.c | 3 +- 7 files changed, 219 insertions(+), 91 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 766ea16fbbb..2d883440cb9 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H +#include + #ifndef __ASSEMBLY__ #include @@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define EARLY_DYNAMIC_PAGE_TABLES 64 + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 888184b2fc8..bdee8bd318e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -731,6 +731,7 @@ extern void enable_sep_cpu(void); extern int sysenter_setup(void); extern void early_trap_init(void); +void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7785e66840a..f57df05ea12 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,11 +27,73 @@ #include #include -static void __init zap_identity_mappings(void) +/* + * Manage page tables very early on. + */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - __flush_tlb_all(); + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + + i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1); + pgd_p = &early_level4_pgt[i].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) { + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + } else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1) + reset_early_page_tables(); + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); + pud_p += i; + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_p[i] = pmd; + pmd += PMD_SIZE; + } + + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + + return 0; } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -72,12 +134,13 @@ void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); - /* Make NULL pointers segfault */ - zap_identity_mappings(); - + /* XXX - this is wrong... we need to build page tables from scratch */ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { @@ -94,6 +157,10 @@ void __init x86_64_start_kernel(char * real_mode_data) if (console_loglevel == 10) early_printk("Kernel alive\n"); + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 980053c4b9c..d94f6d68be2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: - /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded an identity mapped page table * for us. These identity mapped page tables map all of the * kernel pages and possibly all of memory. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from * arch/x86_64/boot/compressed/head.S. @@ -66,7 +65,8 @@ startup_64: * tables and then reload them. */ - /* Compute the delta between the address I am compiled to run at and the + /* + * Compute the delta between the address I am compiled to run at and the * address I am actually running at. */ leaq _text(%rip), %rbp @@ -78,45 +78,62 @@ startup_64: testl %eax, %eax jnz bad_address - /* Is the address too large? */ - leaq _text(%rip), %rdx - movq $PGDIR_SIZE, %rax - cmpq %rax, %rdx - jae bad_address - - /* Fixup the physical addresses in the page table + /* + * Is the address too large? */ - addq %rbp, init_level4_pgt + 0(%rip) - addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) - addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address - addq %rbp, level3_ident_pgt + 0(%rip) + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) addq %rbp, level3_kernel_pgt + (510*8)(%rip) addq %rbp, level3_kernel_pgt + (511*8)(%rip) addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - /* Add an Identity mapping if I am above 1G */ + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ leaq _text(%rip), %rdi - andq $PMD_PAGE_MASK, %rdi + leaq early_level4_pgt(%rip), %rbx movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andq $(PTRS_PER_PUD - 1), %rax - jz ident_complete + shrq $PGDIR_SHIFT, %rax - leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx - leaq level3_ident_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + addq $4096, %rdx movq %rdi, %rax - shrq $PMD_SHIFT, %rax - andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx - leaq level2_spare_pgt(%rip), %rbx - movq %rdx, 0(%rbx, %rax, 8) -ident_complete: + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, (4096+0)(%rbx,%rax,8) + movq %rdx, (4096+8)(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b /* * Fixup the kernel text+data virtual addresses. Note that @@ -124,7 +141,6 @@ ident_complete: * cleanup_highmap() fixes this up along with the mappings * beyond _end. */ - leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ @@ -139,17 +155,14 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with - * zeros. Better take a jmp than relying on empty space being - * filled with 0x90 (nop) - */ - jmp secondary_startup_64 + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f ENTRY(secondary_startup_64) /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, * and someone has loaded a mapped page table. * - * %esi holds a physical pointer to real_mode_data. + * %rsi holds a physical pointer to real_mode_data. * * We come here either from startup_64 (using physical addresses) * or from trampoline.S (using virtual addresses). @@ -159,12 +172,14 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %eax - movq %rax, %cr4 + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax addq phys_base(%rip), %rax movq %rax, %cr3 @@ -196,7 +211,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip),%rsp + movq stack_start(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -236,15 +251,33 @@ ENTRY(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr - /* esi is pointer to real mode structure with interesting info. + /* rsi is pointer to real mode structure with interesting info. pass it to C */ - movl %esi, %edi + movq %rsi, %rdi /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. */ movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder @@ -270,13 +303,13 @@ ENDPROC(start_cpu0) /* SMP bootup changes these two */ __REFDATA - .align 8 - ENTRY(initial_code) + .balign 8 + GLOBAL(initial_code) .quad x86_64_start_kernel - ENTRY(initial_gs) + GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - ENTRY(stack_start) + GLOBAL(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA @@ -284,7 +317,7 @@ ENDPROC(start_cpu0) bad_address: jmp bad_address - .section ".init.text","ax" + __INIT .globl early_idt_handlers early_idt_handlers: # 104(%rsp) %rflags @@ -321,14 +354,22 @@ ENTRY(early_idt_handler) pushq %r11 # 0(%rsp) cmpl $__KERNEL_CS,96(%rsp) - jne 10f + jne 11f + + cmpl $14,72(%rsp) # Page fault? + jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good +10: leaq 88(%rsp),%rdi # Pointer to %rip call early_fixup_exception andl %eax,%eax jnz 20f # Found an exception entry -10: +11: #ifdef CONFIG_EARLY_PRINTK GET_CR2_INTO(%r9) # can clobber any volatile register if pv movl 80(%rsp),%r8d # error code @@ -350,7 +391,7 @@ ENTRY(early_idt_handler) 1: hlt jmp 1b -20: # Exception table entry found +20: # Exception table entry found or page table generated popq %r11 popq %r10 popq %r9 @@ -364,6 +405,8 @@ ENTRY(early_idt_handler) decl early_recursion_flag(%rip) INTERRUPT_RETURN + __INITDATA + .balign 4 early_recursion_flag: .long 0 @@ -374,11 +417,10 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" #endif /* CONFIG_EARLY_PRINTK */ - .previous #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -ENTRY(name) +GLOBAL(name) /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ @@ -388,24 +430,37 @@ ENTRY(name) i = i + 1 ; \ .endr + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + .data - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ + +#ifndef CONFIG_XEN NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 511,8,0 + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE -NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) - .fill 512,8,0 - -NEXT_PAGE(level2_ident_pgt) - /* Since I easily can, map the first 1G. - * Don't set NX because code runs from these pages. - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - NEXT_PAGE(level2_kernel_pgt) /* * 512 MB kernel mapping. We spend a full page on this pagetable @@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt) PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) -NEXT_PAGE(level2_spare_pgt) - .fill 512, 8, 0 +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 #undef PMDS -#undef NEXT_PAGE .data .align 16 @@ -472,6 +517,5 @@ ENTRY(nmi_idt_table) .skip IDT_ENTRIES * 16 __PAGE_ALIGNED_BSS - .align PAGE_SIZE -ENTRY(empty_zero_page) +NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85a8290801d..db9c41dae8d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1005,6 +1005,8 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); + early_trap_pf_init(); + setup_real_mode(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecffca11f4e..68bda7a8415 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -688,10 +688,19 @@ void __init early_trap_init(void) set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); /* int3 can be called from all */ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, &page_fault); +#endif load_idt(&idt_descr); } +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, &page_fault); +#endif +} + void __init trap_init(void) { int i; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 78d1ef3eab6..3364a7643a4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -446,9 +446,10 @@ void __init init_mem_mapping(void) } #else early_ioremap_page_table_range_init(); +#endif + load_cr3(swapper_pg_dir); __flush_tlb_all(); -#endif early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } -- cgit v1.2.3-70-g09d2 From 577af55d802d9fe114287e750504e09e7c677c9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:02 -0800 Subject: x86, kexec: Remove 1024G limitation for kexec buffer on 64bit Now 64bit kernel supports more than 1T ram and kexec tools could find buffer above 1T, remove that obsolete limitation. and use MAXMEM instead. Tested on system with more than 1024G ram. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-22-git-send-email-yinghai@kernel.org Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/kexec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6080d2694ba..17483a492f1 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -48,11 +48,11 @@ # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64) #else /* Maximum physical address we can use pages from */ -# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can reach in physical address mode */ -# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1) /* Maximum address we can use for the control pages */ -# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) +# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) /* Allocate one page for the pdp and the second for the code */ # define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) -- cgit v1.2.3-70-g09d2 From 0e691cf824f76adefb4498fe39c300aba2c2575a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jan 2013 12:20:05 -0800 Subject: x86, kexec, 64bit: Only set ident mapping for ram. We should set mappings only for usable memory ranges under max_pfn Otherwise causes same problem that is fixed by x86, mm: Only direct map addresses that are marked as E820_RAM This patch exposes pfn_mapped array, and only sets ident mapping for ranges in that array. This patch relies on new kernel_ident_mapping_init that could handle existing pgd/pud between different calls. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1359058816-7615-25-git-send-email-yinghai@kernel.org Cc: Alexander Duyck Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page.h | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 13 +++++++++---- arch/x86/mm/init.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 8ca82839288..100a20c7b98 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -17,6 +17,10 @@ struct page; +#include +extern struct range pfn_mapped[]; +extern int nr_pfn_mapped; + static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index d2d7e023a8c..4eabc160696 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -100,10 +100,15 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - result = kernel_ident_mapping_init(&info, level4p, - 0, max_pfn << PAGE_SHIFT); - if (result) - return result; + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + return result; + } /* * segments's mem ranges could be outside 0 ~ max_pfn, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3364a7643a4..d41815265a0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -302,8 +302,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -static struct range pfn_mapped[E820_X_MAX]; -static int nr_pfn_mapped; +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { -- cgit v1.2.3-70-g09d2