From 92bdaef7b2c5d3cb8abc902faa1f7670a183dcdc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 13:50:43 -0400 Subject: Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table high"" This reverts commit a38647837a411f7df79623128421eef2118b5884. It does not work with certain AMD machines. last_pfn = 0x100000 max_arch_pfn = 0x400000000 initial memory mapped : 0 - 02c3a000 Base memory trampoline at [ffff88000009b000] 9b000 size 20480 init_memory_mapping: 0000000000000000-0000000100000000 0000000000 - 0100000000 page 4k kernel direct mapping tables up to 100000000 @ ff7fb000-100000000 init_memory_mapping: 0000000100000000-00000001e0800000 0100000000 - 01e0800000 page 4k kernel direct mapping tables up to 1e0800000 @ 1df0f3000-1e0000000 xen: setting RW the range fffdc000 - 100000000 RAMDISK: 0203b000 - 02c3a000 No NUMA configuration found Faking a node at 0000000000000000-00000001e0800000 NUMA: Using 63 for the hash shift. Initmem setup node 0 0000000000000000-00000001e0800000 NODE_DATA [00000001dfffb000 - 00000001dfffffff] BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] setup_node_bootmem+0x18a/0x1ea PGD 0 Oops: 0003 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.39-0-virtual #6~smb1 RIP: e030:[] [] setup_node_bootmem+0x18a/0x1ea RSP: e02b:ffffffff81c01e38 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 00000001e0800000 RCX: 0000000000001040 RDX: 0000000000004100 RSI: 0000000000000000 RDI: ffff8801dfffb000 RBP: ffffffff81c01e58 R08: 0000000000000020 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000bfe400 FS: 0000000000000000(0000) GS:ffffffff81cca000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000001c03000 CR4: 0000000000000660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffffffff81c00000, task ffffffff81c0b020) Stack: 0000000000000040 0000000000000001 0000000000000000 ffffffffffffffff ffffffff81c01e88 ffffffff81cf6c25 0000000000000000 0000000000000000 ffffffff81cf687f 0000000000000000 ffffffff81c01ea8 ffffffff81cf6e45 Call Trace: [] numa_register_memblks.constprop.3+0x150/0x181 [] ? numa_add_memblk+0x7c/0x7c [] numa_init.part.2+0x1c/0x7c [] ? numa_add_memblk+0x7c/0x7c [] numa_init+0x6c/0x70 [] initmem_init+0x39/0x3b [] setup_arch+0x64e/0x769 [] ? printk+0x51/0x53 [] start_kernel+0xd4/0x3f3 [] x86_64_start_reservations+0x132/0x136 [] xen_start_kernel+0x588/0x58f Code: 41 00 00 48 8b 3c c5 a0 24 cc 81 31 c0 40 f6 c7 01 74 05 aa 66 ba ff 40 40 f6 c7 02 74 05 66 ab 83 ea 02 89 d1 c1 e9 02 f6 c2 02 ab 74 02 66 ab 80 e2 01 74 01 aa 49 63 c4 48 c1 eb 0c 44 89 RIP [] setup_node_bootmem+0x18a/0x1ea RSP CR2: 0000000000000000 ---[ end trace a7919e7f17c0a725 ]--- Kernel panic - not syncing: Attempted to kill the idle task! Pid: 0, comm: swapper Tainted: G D 2.6.39-0-virtual #6~smb1 Reported-by: Stefan Bader Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 123 ----------------------------------------------------- 1 file changed, 123 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 55c965b38c2..cf4ef61e425 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1463,119 +1463,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } -#ifdef CONFIG_X86_64 -static __initdata u64 __last_pgt_set_rw = 0; -static __initdata u64 __pgt_buf_start = 0; -static __initdata u64 __pgt_buf_end = 0; -static __initdata u64 __pgt_buf_top = 0; -/* - * As a consequence of the commit: - * - * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e - * Author: Yinghai Lu - * Date: Fri Dec 17 16:58:28 2010 -0800 - * - * x86-64, mm: Put early page table high - * - * at some point init_memory_mapping is going to reach the pagetable pages - * area and map those pages too (mapping them as normal memory that falls - * in the range of addresses passed to init_memory_mapping as argument). - * Some of those pages are already pagetable pages (they are in the range - * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and - * everything is fine. - * Some of these pages are not pagetable pages yet (they fall in the range - * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they - * are going to be mapped RW. When these pages become pagetable pages and - * are hooked into the pagetable, xen will find that the guest has already - * a RW mapping of them somewhere and fail the operation. - * The reason Xen requires pagetables to be RO is that the hypervisor needs - * to verify that the pagetables are valid before using them. The validation - * operations are called "pinning". - * - * In order to fix the issue we mark all the pages in the entire range - * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation - * is completed only the range pgt_buf_start-pgt_buf_end is reserved by - * init_memory_mapping. Hence the kernel is going to crash as soon as one - * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those - * ranges are RO). - * - * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ - * the init_memory_mapping has completed (in a perfect world we would - * call this function from init_memory_mapping, but lets ignore that). - * - * Because we are called _after_ init_memory_mapping the pgt_buf_[start, - * end,top] have all changed to new values (b/c init_memory_mapping - * is called and setting up another new page-table). Hence, the first time - * we enter this function, we save away the pgt_buf_start value and update - * the pgt_buf_[end,top]. - * - * When we detect that the "old" pgt_buf_start through pgt_buf_end - * PFNs have been reserved (so memblock_x86_reserve_range has been called), - * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. - * - * And then we update those "old" pgt_buf_[end|top] with the new ones - * so that we can redo this on the next pagetable. - */ -static __init void mark_rw_past_pgt(void) { - - if (pgt_buf_end > pgt_buf_start) { - u64 addr, size; - - /* Save it away. */ - if (!__pgt_buf_start) { - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - /* If we get the range that starts at __pgt_buf_end that means - * the range is reserved, and that in 'init_memory_mapping' - * the 'memblock_x86_reserve_range' has been called with the - * outdated __pgt_buf_start, __pgt_buf_end (the "new" - * pgt_buf_[start|end|top] refer now to a new pagetable. - * Note: we are called _after_ the pgt_buf_[..] have been - * updated.*/ - - addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), - &size, PAGE_SIZE); - - /* Still not reserved, meaning 'memblock_x86_reserve_range' - * hasn't been called yet. Update the _end and _top.*/ - if (addr == PFN_PHYS(__pgt_buf_start)) { - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - - /* OK, the area is reserved, meaning it is time for us to - * set RW for the old end->top PFNs. */ - - /* ..unless we had already done this. */ - if (__pgt_buf_end == __last_pgt_set_rw) - return; - - addr = PFN_PHYS(__pgt_buf_end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", - PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); - - while (addr < PFN_PHYS(__pgt_buf_top)) { - make_lowmem_page_readwrite(__va(addr)); - addr += PAGE_SIZE; - } - /* And update everything so that we are ready for the next - * pagetable (the one created for regions past 4GB) */ - __last_pgt_set_rw = __pgt_buf_end; - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - } - return; -} -#else -static __init void mark_rw_past_pgt(void) { } -#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1601,14 +1488,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { unsigned long pfn = pte_pfn(pte); - /* - * A bit of optimization. We do not need to call the workaround - * when xen_set_pte_init is called with a PTE with 0 as PFN. - * That is b/c the pagetable at that point are just being populated - * with empty values and we can save some cycles by not calling - * the 'memblock' code.*/ - if (pfn) - mark_rw_past_pgt(); /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an @@ -2118,8 +1997,6 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { - mark_rw_past_pgt(); - #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif -- cgit v1.2.3-70-g09d2 From 279b706bf800b5967037f492dbe4fc5081ad5d0f Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 14 Apr 2011 15:49:41 +0100 Subject: x86,xen: introduce x86_init.mapping.pagetable_reserve Introduce a new x86_init hook called pagetable_reserve that at the end of init_memory_mapping is used to reserve a range of memory addresses for the kernel pagetable pages we used and free the other ones. On native it just calls memblock_x86_reserve_range while on xen it also takes care of setting the spare memory previously allocated for kernel pagetable pages from RO to RW, so that it can be used for other purposes. A detailed explanation of the reason why this hook is needed follows. As a consequence of the commit: commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e Author: Yinghai Lu Date: Fri Dec 17 16:58:28 2010 -0800 x86-64, mm: Put early page table high at some point init_memory_mapping is going to reach the pagetable pages area and map those pages too (mapping them as normal memory that falls in the range of addresses passed to init_memory_mapping as argument). Some of those pages are already pagetable pages (they are in the range pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and everything is fine. Some of these pages are not pagetable pages yet (they fall in the range pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they are going to be mapped RW. When these pages become pagetable pages and are hooked into the pagetable, xen will find that the guest has already a RW mapping of them somewhere and fail the operation. The reason Xen requires pagetables to be RO is that the hypervisor needs to verify that the pagetables are valid before using them. The validation operations are called "pinning" (more details in arch/x86/xen/mmu.c). In order to fix the issue we mark all the pages in the entire range pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation is completed only the range pgt_buf_start-pgt_buf_end is reserved by init_memory_mapping. Hence the kernel is going to crash as soon as one of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those ranges are RO). For this reason we need a hook to reserve the kernel pagetable pages we used and free the other ones so that they can be reused for other purposes. On native it just means calling memblock_x86_reserve_range, on Xen it also means marking RW the pagetable pages that we allocated before but that haven't been used before. Another way to fix this is without using the hook is by adding a 'if (xen_pv_domain)' in the 'init_memory_mapping' code and calling the Xen counterpart, but that is just nasty. Signed-off-by: Stefano Stabellini Acked-by: Yinghai Lu Acked-by: H. Peter Anvin Cc: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/include/asm/x86_init.h | 12 ++++++++++++ arch/x86/kernel/x86_init.c | 4 ++++ arch/x86/mm/init.c | 24 ++++++++++++++++++++++-- arch/x86/xen/mmu.c | 15 +++++++++++++++ 5 files changed, 54 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723d1f3..d56187c6b83 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 643ebf2e2ad..d3d859035af 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -67,6 +67,17 @@ struct x86_init_oem { void (*banner)(void); }; +/** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. + */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + /** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call @@ -123,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128..75ef4b18e9b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 286d289b039..722a4c372ce 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } +void native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); +} + struct map_range { unsigned long start; unsigned long end; @@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ if (!after_bootmem && pgt_buf_end > pgt_buf_start) - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index cf4ef61e425..0684f3c74d5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) +{ + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } +} + static void xen_post_allocator_init(void); static __init void xen_pagetable_setup_done(pgd_t *base) @@ -2105,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; -- cgit v1.2.3-70-g09d2 From 53f8023febf9b3e18d8fb0d99c55010e473ce53d Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Sun, 17 Apr 2011 16:17:34 +0200 Subject: x86/mm: Fix section mismatch derived from native_pagetable_reserve() With CONFIG_DEBUG_SECTION_MISMATCH=y I see these warnings in next-20110415: LD vmlinux.o MODPOST vmlinux.o WARNING: vmlinux.o(.text+0x1ba48): Section mismatch in reference from the function native_pagetable_reserve() to the function .init.text:memblock_x86_reserve_range() The function native_pagetable_reserve() references the function __init memblock_x86_reserve_range(). This is often because native_pagetable_reserve lacks a __init annotation or the annotation of memblock_x86_reserve_range is wrong. This patch fixes the issue. Thanks to pipacs from PaX project for help on IRC. Acked-by: "H. Peter Anvin" Signed-off-by: Sedat Dilek Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 722a4c372ce..37b8b0fe832 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,7 +81,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } -void native_pagetable_reserve(u64 start, u64 end) +void __init native_pagetable_reserve(u64 start, u64 end) { memblock_x86_reserve_range(start, end, "PGTABLE"); } -- cgit v1.2.3-70-g09d2