From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin"
Date: Wed, 22 Mar 2006 00:08:50 -0800
Subject: [PATCH] Enable mprotect on huge pages

2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb
mprotect.

From: David Gibson

Remove a test from the mprotect() path which checks that the mprotect()ed
range on a hugepage VMA is hugepage aligned (yes, really, the sense of
is_aligned_hugepage_range() is the opposite of what you'd guess :-/).

In fact, we don't need this test.  If the given addresses match the
beginning/end of a hugepage VMA they must already be suitably aligned.  If
they don't, then mprotect_fixup() will attempt to split the VMA.  The very
first test in split_vma() will check for a badly aligned address on a
hugepage VMA and return -EINVAL if necessary.

From: "Chen, Kenneth W"

On i386 and x86-64, the pte flag _PAGE_PSE collides with _PAGE_PROTNONE.
The identity of a hugetlb pte is lost when changing page protection via
mprotect.  A page fault that occurs later will trigger a bug check in
huge_pte_alloc().

The fix is to always make the new pte a hugetlb pte, and also to clean up
legacy code where _PAGE_PRESENT was forced on in the pre-faulting days.

Signed-off-by: Zhang Yanmin
Cc: David Gibson
Cc: "David S. Miller"
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: William Lee Irwin III
Signed-off-by: Ken Chen
Signed-off-by: Nishanth Aravamudan
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/asm-x86_64/pgtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/asm-x86_64/pgtable.h')

diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 715fd94cf57..a617d364d08 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte) { return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
+static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_PSE; }
 static inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
 static inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
@@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _
 static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
 static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
-static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; }
+static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
 struct vm_area_struct;
--
cgit v1.2.3-70-g09d2
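The _PAGE_PSE/_PAGE_PROTNONE collision described above is easiest to see
with concrete bit values.  Below is a minimal user-space sketch, assuming
the flag values of the 2.6.16-era x86-64 headers (copied here only for
illustration; the authoritative definitions live in
include/asm-x86_64/pgtable.h).  It shows why the old pte_huge() test, which
demanded _PAGE_PRESENT as part of __LARGE_PTE, stops recognizing a huge pte
once a protection change clears the present bit, while the new single-bit
test keeps working:

/* Standalone illustration of why the old pte_huge() test broke when page
 * protection changed.  Flag values mirror the 2.6.16-era x86-64 headers
 * and are used for illustration only; this is not kernel code. */
#include <stdio.h>

#define _PAGE_PRESENT	0x001
#define _PAGE_PSE	0x080	/* 2MB page */
#define _PAGE_PROTNONE	0x080	/* shares the bit with _PAGE_PSE */
#define __LARGE_PTE	(_PAGE_PSE | _PAGE_PRESENT)

/* Old test: requires both PSE and PRESENT to be set. */
static int pte_huge_old(unsigned long pte)
{
	return (pte & __LARGE_PTE) == __LARGE_PTE;
}

/* New test: the PSE bit alone identifies a huge pte. */
static int pte_huge_new(unsigned long pte)
{
	return !!(pte & _PAGE_PSE);
}

int main(void)
{
	/* A huge pte after a PROT_NONE protection change: the present bit
	 * is cleared, only the PSE/PROTNONE bit remains set. */
	unsigned long pte = _PAGE_PSE;

	printf("old pte_huge(): %d\n", pte_huge_old(pte));	/* 0 - identity lost */
	printf("new pte_huge(): %d\n", pte_huge_new(pte));	/* 1 - still a huge pte */
	return 0;
}

The same reasoning is why pte_mkhuge() now sets only _PAGE_PSE instead of
forcing _PAGE_PRESENT on.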
From 3d1712c91df01d2573b934e972e231e8edb102c7 Mon Sep 17 00:00:00 2001
From: Akinobu Mita
Date: Fri, 24 Mar 2006 03:15:11 -0800
Subject: [PATCH] x86_64: {set,clear,test}_bit() related cleanup and pci_mmcfg_init() fix

While working on this patch set, I found several possible cleanups on
x86-64 and ia64.

akpm: I stole this from Andi's queue.

Not only does it clean up bitops.  It also unrelatedly changes the
prototype of pci_mmcfg_init() and removes its arch_initcall().

It seems that the wrong two patches got joined together, but this is the
one which has been tested.

This patch fixes the current x86_64 build error (the pci_mmcfg_init()
declaration in arch/i386/pci/pci.h disagrees with the definition in
arch/x86_64/pci/mmconfig.c).

This also means that x86_64's pci_mmcfg_init() gets called in the same
(new) manner as x86's: from arch/i386/pci/init.c:pci_access_init(),
rather than via initcall.

The bitops cleanups came along for free.

All this worked OK in -mm testing (since 2.6.16-rc4-mm1) because x86_64
was tested with both patches applied.

Signed-off-by: Akinobu Mita
Signed-off-by: Andi Kleen
Cc: Con Kolivas
Cc: Jean Delvare
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/mce.c         |  3 +--
 arch/x86_64/kernel/setup.c       |  3 +--
 arch/x86_64/pci/mmconfig.c       | 18 +++++++-----------
 include/asm-x86_64/mmu_context.h |  6 +++---
 include/asm-x86_64/pgtable.h     |  6 +++---
 5 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'include/asm-x86_64/pgtable.h')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index b8b9529fa89..04282ef9fbd 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -139,8 +139,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 static int mce_available(struct cpuinfo_x86 *c)
 {
-	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
-	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
+	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index aa55e3cec66..f227d0c23dc 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1344,8 +1344,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	{
 		int i;
 		for ( i = 0 ; i < 32*NCAPINTS ; i++ )
-			if ( test_bit(i, &c->x86_capability) &&
-			     x86_cap_flags[i] != NULL )
+			if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
 				seq_printf(m, " %s", x86_cap_flags[i]);
 	}
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 18f371fe37f..e616500207e 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -55,7 +55,7 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
 {
 	char __iomem *addr;
-	if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), &fallback_slots))
+	if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots))
 		return NULL;
 	addr = get_virt(seg, bus);
 	if (!addr)
@@ -143,29 +143,29 @@ static __init void unreachable_devices(void)
 			continue;
 		addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
 		if (addr == NULL|| readl(addr) != val1) {
-			set_bit(i, &fallback_slots);
+			set_bit(i, fallback_slots);
 		}
 	}
 }
-static int __init pci_mmcfg_init(void)
+void __init pci_mmcfg_init(void)
 {
 	int i;
 	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
-		return 0;
+		return;
 	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
 	if ((pci_mmcfg_config_num == 0) ||
 	    (pci_mmcfg_config == NULL) ||
 	    (pci_mmcfg_config[0].base_address == 0))
-		return 0;
+		return;
 	/* RED-PEN i386 doesn't do _nocache right now */
 	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
 	if (pci_mmcfg_virt == NULL) {
 		printk("PCI: Can not allocate memory for mmconfig structures\n");
-		return 0;
+		return;
 	}
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
 		pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
@@ -173,7 +173,7 @@ static int __init pci_mmcfg_init(void)
 		if (!pci_mmcfg_virt[i].virt) {
 			printk("PCI: Cannot map mmconfig aperture for segment %d\n",
 			       pci_mmcfg_config[i].pci_segment_group_number);
-			return 0;
+			return;
 		}
 		printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address);
 	}
@@ -182,8 +182,4 @@ static int __init pci_mmcfg_init(void)
 	raw_pci_ops = &pci_mmcfg;
 	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-
-	return 0;
 }
-
-arch_initcall(pci_mmcfg_init);
diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index 16e4be4de0c..19f0c83d079 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -34,12 +34,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	unsigned cpu = smp_processor_id();
 	if (likely(prev != next)) {
 		/* stop flush ipis for the previous mm */
-		clear_bit(cpu, &prev->cpu_vm_mask);
+		cpu_clear(cpu, prev->cpu_vm_mask);
 #ifdef CONFIG_SMP
 		write_pda(mmu_state, TLBSTATE_OK);
 		write_pda(active_mm, next);
 #endif
-		set_bit(cpu, &next->cpu_vm_mask);
+		cpu_set(cpu, next->cpu_vm_mask);
 		load_cr3(next->pgd);
 		if (unlikely(next->context.ldt != prev->context.ldt))
@@ -50,7 +50,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			write_pda(mmu_state, TLBSTATE_OK);
 			if (read_pda(active_mm) != next)
 				out_of_line_bug();
-			if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
+			if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
 				/* We were in lazy tlb mode and leave_mm disabled
 				 * tlb flush IPI delivery. We must reload CR3
 				 * to make sure to use no freed page tables.
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index a617d364d08..def90328719 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -293,19 +293,19 @@ static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned
 {
 	if (!pte_dirty(*ptep))
 		return 0;
-	return test_and_clear_bit(_PAGE_BIT_DIRTY, ptep);
+	return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
 }
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	if (!pte_young(*ptep))
 		return 0;
-	return test_and_clear_bit(_PAGE_BIT_ACCESSED, ptep);
+	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
 }
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	clear_bit(_PAGE_BIT_RW, ptep);
+	clear_bit(_PAGE_BIT_RW, &ptep->pte);
 }
 /*
--
cgit v1.2.3-70-g09d2
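The bitops changes in the patch above all follow one pattern: the generic
set_bit()/clear_bit()/test_bit() helpers operate on an array of unsigned
long, so callers should hand them the bitmap itself (which decays to
unsigned long *) or the address of the specific unsigned long word holding
the bits, rather than the address of some differently typed wrapper; where
a dedicated accessor exists (cpu_has(), cpu_set()/cpu_clear()), it is
preferred.  Below is a user-space analogue with simplified, non-atomic
stand-ins for the kernel helpers; the names and the 32-slot size are
illustrative assumptions, not the kernel's definitions:

/* User-space analogue of the bitmap-passing cleanup.  set_bit()/test_bit()
 * here are simplified, non-atomic stand-ins for the kernel primitives. */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG		(sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

static void set_bit(unsigned int nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit(unsigned int nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
	DECLARE_BITMAP(fallback_slots, 32) = { 0 };

	/* After the cleanup: pass the bitmap array itself, which decays to
	 * unsigned long *.  The pre-cleanup calls in the hunks above passed
	 * the address of the enclosing object (&fallback_slots,
	 * &c->x86_capability) rather than the bitmap words themselves. */
	set_bit(3, fallback_slots);
	printf("slot 3 flagged: %d\n", test_bit(3, fallback_slots));
	return 0;
}

The &ptep->pte and cpu_set()/cpu_clear() conversions in the hunks above are
the same idea applied to the pte_t and cpumask_t wrapper types.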
From 8c914cb704a11460eec7ed2a572bb5e9bd513d24 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Sat, 25 Mar 2006 16:29:40 +0100
Subject: [PATCH] x86_64: actively synchronize vmalloc area when registering certain callbacks

While the modular aspect of the respective i386 patch doesn't apply to
x86-64 (as the top level page directory entry is shared between modules
and the base kernel), handlers registered with register_die_notifier()
are still under similar constraints for touching ioremap()ed or
vmalloc()ed memory.  The likelihood of this problem becoming visible is
of course significantly lower, as the assigned virtual addresses would
have to cross a 2**39 byte boundary.

This is because the callback gets invoked (a) in the page fault path
before the top level page table propagation gets carried out (hence a
fault to propagate the top level page table entry/entries mapping the
module's code/data would nest infinitely) and (b) in the NMI path, where
nested faults must absolutely not happen, since otherwise the IRET from
the nested fault re-enables NMIs, potentially resulting in nested NMI
occurrences.

Signed-off-by: Jan Beulich
Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 arch/x86_64/kernel/nmi.c     |  1 +
 arch/x86_64/kernel/traps.c   |  2 ++
 arch/x86_64/mm/fault.c       | 73 ++++++++++++++++++++++++++++++++++++--------
 include/asm-x86_64/pgalloc.h | 28 +++++++++++++++++
 include/asm-x86_64/pgtable.h |  4 +++
 5 files changed, 95 insertions(+), 13 deletions(-)

(limited to 'include/asm-x86_64/pgtable.h')

diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 66c009e10ba..d9e4067faf0 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -534,6 +534,7 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 void set_nmi_callback(nmi_callback_t callback)
 {
+	vmalloc_sync_all();
 	rcu_assign_pointer(nmi_callback, callback);
 }
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 28d50dc540e..b25bc904d42 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -78,6 +78,8 @@ int register_die_notifier(struct notifier_block *nb)
 {
 	int err = 0;
 	unsigned long flags;
+
+	vmalloc_sync_all();
 	spin_lock_irqsave(&die_notifier_lock, flags);
 	err = notifier_chain_register(&die_chain, nb);
 	spin_unlock_irqrestore(&die_notifier_lock, flags);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 2e7c3c8ffe0..de91e17daf6 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -264,6 +264,8 @@ static int vmalloc_fault(unsigned long address)
 		return -1;
 	if (pgd_none(*pgd))
 		set_pgd(pgd, *pgd_ref);
+	else
+		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
 	/* Below here mismatches are bugs because these lower tables
 	   are shared */
@@ -314,16 +316,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
-	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-					SIGSEGV) == NOTIFY_STOP)
-		return;
-
-	if (likely(regs->eflags & X86_EFLAGS_IF))
-		local_irq_enable();
-
-	if (unlikely(page_fault_trace))
-		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
-		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
 	tsk = current;
 	mm = tsk->mm;
@@ -351,10 +343,12 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	 */
 	if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 		((address >= VMALLOC_START && address < VMALLOC_END))) {
-		if (vmalloc_fault(address) < 0)
-			goto bad_area_nosemaphore;
-		return;
+		if (vmalloc_fault(address) >= 0)
+			return;
 	}
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
 	/*
 	 * Don't take the mm semaphore here. If we fixup a prefetch
 	 * fault we could otherwise deadlock.
@@ -362,6 +356,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (likely(regs->eflags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(page_fault_trace))
+		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
+
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
@@ -571,6 +576,48 @@ do_sigbus:
 	return;
 }
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+void vmalloc_sync_all(void)
+{
+	/* Note that races in the updates of insync and start aren't
+	   problematic:
+	   insync can only get set bits added, and updates to start are only
+	   improving performance (without affecting correctness if undone). */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long address;
+
+	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			const pgd_t *pgd_ref = pgd_offset_k(address);
+			struct page *page;
+
+			if (pgd_none(*pgd_ref))
+				continue;
+			spin_lock(&pgd_lock);
+			for (page = pgd_list; page;
+			     page = (struct page *)page->index) {
+				pgd_t *pgd;
+				pgd = (pgd_t *)page_address(page) + pgd_index(address);
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+				else
+					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+			}
+			spin_unlock(&pgd_lock);
+			set_bit(pgd_index(address), insync);
+		}
+		if (address == start)
+			start = address + PGDIR_SIZE;
+	}
+	/* Check that there is no need to do the same for the modules area. */
+	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+				(__START_KERNEL & PGDIR_MASK)));
+}
+
 static int __init enable_pagefaulttrace(char *str)
 {
 	page_fault_trace = 1;
diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index 08cad2482bc..43d4c333a8b 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -45,12 +45,39 @@ static inline void pud_free (pud_t *pud)
 	free_page((unsigned long)pud);
 }
+static inline void pgd_list_add(pgd_t *pgd)
+{
+	struct page *page = virt_to_page(pgd);
+
+	spin_lock(&pgd_lock);
+	page->index = (pgoff_t)pgd_list;
+	if (pgd_list)
+		pgd_list->private = (unsigned long)&page->index;
+	pgd_list = page;
+	page->private = (unsigned long)&pgd_list;
+	spin_unlock(&pgd_lock);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+	struct page *next, **pprev, *page = virt_to_page(pgd);
+
+	spin_lock(&pgd_lock);
+	next = (struct page *)page->index;
+	pprev = (struct page **)page->private;
+	*pprev = next;
+	if (next)
+		next->private = (unsigned long)pprev;
+	spin_unlock(&pgd_lock);
+}
+
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	unsigned boundary;
 	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
 	if (!pgd)
 		return NULL;
+	pgd_list_add(pgd);
 	/*
 	 * Copy kernel pointers in from init.
 	 * Could keep a freelist or slab cache of those because the kernel
@@ -67,6 +94,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 static inline void pgd_free(pgd_t *pgd)
 {
 	BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
+	pgd_list_del(pgd);
 	free_page((unsigned long)pgd);
 }
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index def90328719..31e83c3bd02 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -420,6 +420,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
+extern spinlock_t pgd_lock;
+extern struct page *pgd_list;
+void vmalloc_sync_all(void);
+
 #endif /* !__ASSEMBLY__ */
 extern int kern_addr_valid(unsigned long addr);
--
cgit v1.2.3-70-g09d2
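pgd_list_add() and pgd_list_del() above thread every process page directory
onto a singly linked list through struct page fields (index as the next
link, private as a pointer back to the link that points at this page), so
vmalloc_sync_all() can walk all pgds and any entry can unlink itself in
O(1) without a special case for the head.  Here is a small user-space
sketch of that pprev technique under an illustrative node type (not the
kernel's struct page):

/* Sketch of the pprev-style intrusive list used by pgd_list_add/del:
 * each node stores a pointer to the link that points at it, so removal
 * needs no list walk and no special case for the head. */
#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;	/* plays the role of page->index   */
	struct node **pprev;	/* plays the role of page->private */
};

static struct node *list_head;	/* plays the role of pgd_list */

static void node_add(struct node *n)
{
	n->next = list_head;
	if (list_head)
		list_head->pprev = &n->next;
	list_head = n;
	n->pprev = &list_head;
}

static void node_del(struct node *n)
{
	*n->pprev = n->next;	/* works for head and interior nodes alike */
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };

	node_add(&a);
	node_add(&b);		/* list: b -> a */
	node_del(&b);		/* unlink the head without touching a */
	assert(list_head == &a && a.next == NULL);
	node_del(&a);
	assert(list_head == NULL);
	return 0;
}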