From 1160c2779b826c6f5c08e5cc542de58fd1f667d5 Mon Sep 17 00:00:00 2001 From: Samu Kallio Date: Sat, 23 Mar 2013 09:36:35 -0400 Subject: x86, mm, paravirt: Fix vmalloc_fault oops during lazy MMU updates In paravirtualized x86_64 kernels, vmalloc_fault may cause an oops when lazy MMU updates are enabled, because set_pgd effects are being deferred. One instance of this problem is during process mm cleanup with memory cgroups enabled. The chain of events is as follows: - zap_pte_range enables lazy MMU updates - zap_pte_range eventually calls mem_cgroup_charge_statistics, which accesses the vmalloc'd mem_cgroup per-cpu stat area - vmalloc_fault is triggered which tries to sync the corresponding PGD entry with set_pgd, but the update is deferred - vmalloc_fault oopses due to a mismatch in the PUD entries The OOPs usually looks as so: ------------[ cut here ]------------ kernel BUG at arch/x86/mm/fault.c:396! invalid opcode: 0000 [#1] SMP .. snip .. CPU 1 Pid: 10866, comm: httpd Not tainted 3.6.10-4.fc18.x86_64 #1 RIP: e030:[] [] vmalloc_fault+0x11f/0x208 .. snip .. Call Trace: [] do_page_fault+0x399/0x4b0 [] ? xen_mc_extend_args+0xec/0x110 [] page_fault+0x25/0x30 [] ? mem_cgroup_charge_statistics.isra.13+0x13/0x50 [] __mem_cgroup_uncharge_common+0xd8/0x350 [] mem_cgroup_uncharge_page+0x57/0x60 [] page_remove_rmap+0xe0/0x150 [] ? vm_normal_page+0x1a/0x80 [] unmap_single_vma+0x531/0x870 [] unmap_vmas+0x52/0xa0 [] ? pte_mfn_to_pfn+0x72/0x100 [] exit_mmap+0x98/0x170 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] mmput+0x83/0xf0 [] exit_mm+0x104/0x130 [] do_exit+0x15a/0x8c0 [] do_group_exit+0x3f/0xa0 [] sys_exit_group+0x17/0x20 [] system_call_fastpath+0x16/0x1b Calling arch_flush_lazy_mmu_mode immediately after set_pgd makes the changes visible to the consistency checks. Cc: RedHat-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=914737 Tested-by: Josh Boyer Reported-and-Tested-by: Krishna Raman Signed-off-by: Samu Kallio Link: http://lkml.kernel.org/r/1364045796-10720-1-git-send-email-konrad.wilk@oracle.com Tested-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2b97525246d..0e883364abb 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -378,10 +378,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) + if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); - else + arch_flush_lazy_mmu_mode(); + } else { BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } /* * Below here mismatches are bugs because these lower tables -- cgit v1.2.3-70-g09d2 From f76cfa3c2496c462b5bc01bd0c9340c2715b73ca Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 10 Apr 2013 15:28:25 +0200 Subject: x86/mm/cpa: Convert noop to functional fix Commit: a8aed3e0752b ("x86/mm/pageattr: Prevent PSE and GLOABL leftovers to confuse pmd/pte_present and pmd_huge") introduced a valid fix but one location that didn't trigger the bug that lead to finding those (small) problems, wasn't updated using the right variable. The wrong variable was also initialized for no good reason, that may have been the source of the confusion. Remove the noop initialization accordingly. Commit a8aed3e0752b also erroneously removed one canon_pgprot pass meant to clear pmd bitflags not supported in hardware by older CPUs, that automatically gets corrected by this patch too by applying it to the right variable in the new location. Reported-by: Stefan Bader Signed-off-by: Andrea Arcangeli Acked-by: Borislav Petkov Cc: Andy Whitcroft Cc: Mel Gorman Link: http://lkml.kernel.org/r/1365600505-19314-1-git-send-email-aarcange@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 091934e1d0d..7896f7190fd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -467,7 +467,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pte_pgprot(old_pte); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -478,12 +478,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL * for the ancient hardware that doesn't support it. */ - if (pgprot_val(new_prot) & _PAGE_PRESENT) - pgprot_val(new_prot) |= _PAGE_PSE | _PAGE_GLOBAL; + if (pgprot_val(req_prot) & _PAGE_PRESENT) + pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; else - pgprot_val(new_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); - new_prot = canon_pgprot(new_prot); + req_prot = canon_pgprot(req_prot); /* * old_pte points to the large page base address. So we need -- cgit v1.2.3-70-g09d2 From 18699739b60cb60230153ff5475b2ba92be185f9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 11 Apr 2013 15:36:09 +0200 Subject: x86/mm/cpa/selftest: Fix false positive in CPA self test If the pmd is not present, _PAGE_PSE will not be set anymore. Fix the false positive. Reported-by: Ingo Molnar Signed-off-by: Andrea Arcangeli Cc: Stefan Bader Cc: Andy Whitcroft Cc: Mel Gorman Cc: Borislav Petkov Link: http://lkml.kernel.org/r/1365687369-30802-1-git-send-email-aarcange@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b0086567271..0e38951e65e 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -68,7 +68,7 @@ static int print_split(struct split_state *s) s->gpg++; i += GPS/PAGE_SIZE; } else if (level == PG_LEVEL_2M) { - if (!(pte_val(*pte) & _PAGE_PSE)) { + if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) { printk(KERN_ERR "%lx level %d but not PSE %Lx\n", addr, level, (u64)pte_val(*pte)); -- cgit v1.2.3-70-g09d2 From 26564600c9e88c6572a5e6ef5ae9121907edfb7f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 11 Apr 2013 13:59:52 -0400 Subject: x86/mm: Flush lazy MMU when DEBUG_PAGEALLOC is set When CONFIG_DEBUG_PAGEALLOC is set page table updates made by kernel_map_pages() are not made visible (via TLB flush) immediately if lazy MMU is on. In environments that support lazy MMU (e.g. Xen) this may lead to fatal page faults, for example, when zap_pte_range() needs to allocate pages in __tlb_remove_page() -> tlb_next_batch(). Signed-off-by: Boris Ostrovsky Cc: konrad.wilk@oracle.com Link: http://lkml.kernel.org/r/1365703192-2089-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7896f7190fd..fb4e73ec24d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1413,6 +1413,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + arch_flush_lazy_mmu_mode(); } #ifdef CONFIG_HIBERNATION -- cgit v1.2.3-70-g09d2 From 1de14c3c5cbc9bb17e9dcc648cda51c0c85d54b9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Apr 2013 16:23:54 -0700 Subject: x86-32: Fix possible incomplete TLB invalidate with PAE pagetables This patch attempts to fix: https://bugzilla.kernel.org/show_bug.cgi?id=56461 The symptom is a crash and messages like this: chrome: Corrupted page table at address 34a03000 *pdpt = 0000000000000000 *pde = 0000000000000000 Bad pagetable: 000f [#1] PREEMPT SMP Ingo guesses this got introduced by commit 611ae8e3f520 ("x86/tlb: enable tlb flush range support for x86") since that code started to free unused pagetables. On x86-32 PAE kernels, that new code has the potential to free an entire PMD page and will clear one of the four page-directory-pointer-table (aka pgd_t entries). The hardware aggressively "caches" these top-level entries and invlpg does not actually affect the CPU's copy. If we clear one we *HAVE* to do a full TLB flush, otherwise we might continue using a freed pmd page. (note, we do this properly on the population side in pud_populate()). This patch tracks whenever we clear one of these entries in the 'struct mmu_gather', and ensures that we follow up with a full tlb flush. BTW, I disassembled and checked that: if (tlb->fullmm == 0) and if (!tlb->fullmm && !tlb->need_flush_all) generate essentially the same code, so there should be zero impact there to the !PAE case. Signed-off-by: Dave Hansen Cc: Peter Anvin Cc: Ingo Molnar Cc: Artem S Tashkinov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/tlb.h | 2 +- arch/x86/mm/pgtable.c | 7 +++++++ include/asm-generic/tlb.h | 7 ++++++- mm/memory.c | 1 + 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 4fef20773b8..c7797307fc2 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -7,7 +7,7 @@ #define tlb_flush(tlb) \ { \ - if (tlb->fullmm == 0) \ + if (!tlb->fullmm && !tlb->need_flush_all) \ flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ else \ flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 193350b51f9..17fda6a8b3c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -58,6 +58,13 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + /* + * NOTE! For PAE, any changes to the top page-directory-pointer-table + * entries need a full cr3 reload to flush. + */ +#ifdef CONFIG_X86_PAE + tlb->need_flush_all = 1; +#endif tlb_remove_page(tlb, virt_to_page(pmd)); } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 25f01d0bc14..b1b1fa6ffff 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -99,7 +99,12 @@ struct mmu_gather { unsigned int need_flush : 1, /* Did free PTEs */ fast_mode : 1; /* No batching */ - unsigned int fullmm; + /* we are in the middle of an operation to clear + * a full mm and can make some optimizations */ + unsigned int fullmm : 1, + /* we have performed an operation which + * requires a complete flush of the tlb */ + need_flush_all : 1; struct mmu_gather_batch *active; struct mmu_gather_batch local; diff --git a/mm/memory.c b/mm/memory.c index 494526ae024..13cbc420fea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->mm = mm; tlb->fullmm = fullmm; + tlb->need_flush_all = 0; tlb->start = -1UL; tlb->end = 0; tlb->need_flush = 0; -- cgit v1.2.3-70-g09d2