From 29409997f8d06d693d82127d200eeaf48989fdd2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:16 +0530 Subject: powerpc: move find_linux_pte_or_hugepte and gup_hugepte to common code We will use this in the later patch for handling THP pages Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugetlbpage.c | 251 +++++++++++++++++++++--------------------- 1 file changed, 128 insertions(+), 123 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 237c8e5f264..2865077e015 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -21,6 +21,9 @@ #include #include #include +#include + +#ifdef CONFIG_HUGETLB_PAGE #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 @@ -100,66 +103,6 @@ int pgd_huge(pgd_t pgd) } #endif -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page, bottom two bits != 00 - * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table - */ -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (shift) - *shift = 0; - - pg = pgdir + pgd_index(ea); - - if (pgd_huge(*pg)) { - ret_pte = (pte_t *) pg; - goto out; - } else if (is_hugepd(pg)) - hpdp = (hugepd_t *)pg; - else if (!pgd_none(*pg)) { - pdshift = PUD_SHIFT; - pu = pud_offset(pg, ea); - - if (pud_huge(*pu)) { - ret_pte = (pte_t *) pu; - goto out; - } else if (is_hugepd(pu)) - hpdp = (hugepd_t *)pu; - else if (!pud_none(*pu)) { - pdshift = PMD_SHIFT; - pm = pmd_offset(pu, ea); - - if (pmd_huge(*pm)) { - ret_pte = (pte_t *) pm; - goto out; - } else if (is_hugepd(pm)) - hpdp = (hugepd_t *)pm; - else if (!pmd_none(*pm)) - return pte_offset_kernel(pm, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (shift) - *shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); - pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); @@ -753,69 +696,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long mask; - unsigned long pte_end; - struct page *head, *page, *tail; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = *ptep; - mask = _PAGE_PRESENT | _PAGE_USER; - if (write) - mask |= _PAGE_RW; - - if ((pte_val(pte) & mask) != mask) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - tail = page; - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - /* - * Any tail page need their mapcount reference taken 
before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - - return 1; -} - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { @@ -1032,3 +912,128 @@ void flush_dcache_icache_hugepage(struct page *page) } } } + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page, bottom two bits != 00 + * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + */ +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pg = pgdir + pgd_index(ea); + + if (pgd_huge(*pg)) { + ret_pte = (pte_t *) pg; + goto out; + } else if (is_hugepd(pg)) + hpdp = (hugepd_t *)pg; + else if (!pgd_none(*pg)) { + pdshift = PUD_SHIFT; + pu = pud_offset(pg, ea); + + if (pud_huge(*pu)) { + ret_pte = (pte_t *) pu; + goto out; + } else if (is_hugepd(pu)) + hpdp = (hugepd_t *)pu; + else if (!pud_none(*pu)) { + pdshift = PMD_SHIFT; + pm = pmd_offset(pu, ea); + + if (pmd_huge(*pm)) { + ret_pte = (pte_t *) pm; + goto out; + } else if (is_hugepd(pm)) + hpdp = (hugepd_t *)pm; + else if (!pmd_none(*pm)) + return pte_offset_kernel(pm, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (shift) + *shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); + +int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page, *tail; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; + + if ((pte_val(pte) & mask) != mask) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. 
+ */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} -- cgit v1.2.3-70-g09d2 From ac52ae4721233150a3c30e9732a1c1f4f68e7db7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:17 +0530 Subject: powerpc: Update find_linux_pte_or_hugepte to handle transparent hugepages Reviewed-by: David Gibson Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugetlbpage.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 2865077e015..49282045ee9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -936,30 +936,50 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift pg = pgdir + pgd_index(ea); - if (pgd_huge(*pg)) { + /* + * we should first check for none. That takes care of a + * a parallel hugetlb or THP pagefault moving none entries + * to respective types. + */ + if (pgd_none(*pg)) + return NULL; + else if (pgd_huge(*pg)) { ret_pte = (pte_t *) pg; goto out; } else if (is_hugepd(pg)) hpdp = (hugepd_t *)pg; - else if (!pgd_none(*pg)) { + else { pdshift = PUD_SHIFT; pu = pud_offset(pg, ea); - if (pud_huge(*pu)) { + if (pud_none(*pu)) + return NULL; + else if (pud_huge(*pu)) { ret_pte = (pte_t *) pu; goto out; } else if (is_hugepd(pu)) hpdp = (hugepd_t *)pu; - else if (!pud_none(*pu)) { + else { pdshift = PMD_SHIFT; pm = pmd_offset(pu, ea); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + * + * A hugepage split is captured by pmd_trans_splitting + * because we mark the pmd trans splitting and do a + * hpte invalidate + * + */ + if (pmd_none(*pm) || pmd_trans_splitting(*pm)) + return NULL; - if (pmd_huge(*pm)) { + if (pmd_huge(*pm) || pmd_large(*pm)) { ret_pte = (pte_t *) pm; goto out; } else if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; - else if (!pmd_none(*pm)) + else return pte_offset_kernel(pm, ea); } } -- cgit v1.2.3-70-g09d2 From 12bc9f6fc1d6582b4529ac522d2231bd2584a5f1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:18 +0530 Subject: powerpc: Replace find_linux_pte with find_linux_pte_or_hugepte Replace find_linux_pte with find_linux_pte_or_hugepte and explicitly document why we don't need to handle transparent hugepages at callsites. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64.h | 24 ------------------------ arch/powerpc/kernel/eeh.c | 7 ++++++- arch/powerpc/kernel/io-workarounds.c | 11 +++++++++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 8 +++++++- arch/powerpc/mm/hugetlbpage.c | 8 ++++++-- arch/powerpc/mm/tlb_hash64.c | 9 +++++++-- 7 files changed, 36 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 6c9323f3ab5..e71bd25d62d 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -344,30 +344,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); - -/* - * find_linux_pte returns the address of a linux pte for a given - * effective address and directory. If not found, it returns zero. 
- */ -static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *pt = NULL; - - pg = pgdir + pgd_index(ea); - if (!pgd_none(*pg)) { - pu = pud_offset(pg, ea); - if (!pud_none(*pu)) { - pm = pmd_offset(pu, ea); - if (pmd_present(*pm)) - pt = pte_offset_kernel(pm, ea); - } - } - return pt; -} - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7c567be3dd0..af2b9ae07df 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -260,10 +260,15 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) { pte_t *ptep; unsigned long pa; + int hugepage_shift; - ptep = find_linux_pte(init_mm.pgd, token); + /* + * We won't find hugepages here, iomem + */ + ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); if (!ptep) return token; + WARN_ON(hugepage_shift); pa = pte_pfn(*ptep) << PAGE_SHIFT; return pa | (token & (PAGE_SIZE-1)); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 50e90b7e713..fa0b54b2a36 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { + unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - ptep = find_linux_pte(init_mm.pgd, vaddr); + ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + &hugepage_shift); if (ptep == NULL) paddr = 0; - else + else { + /* + * we don't have hugepages backing iomem + */ + WARN_ON(hugepage_shift); paddr = pte_pfn(*ptep) << PAGE_SHIFT; + } bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 6dcbb49105a..dcf892d25a5 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x) unsigned long addr = (unsigned long) x; pte_t *p; - p = find_linux_pte(swapper_pg_dir, addr); + p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); if (!p || !pte_present(*p)) return NULL; /* assume we don't have huge pages in vmalloc space... */ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 2f470809876..e8434ca6efd 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1145,6 +1145,7 @@ EXPORT_SYMBOL_GPL(hash_page); void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { + int hugepage_shift; unsigned long vsid; pgd_t *pgdir; pte_t *ptep; @@ -1166,10 +1167,15 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; - ptep = find_linux_pte(pgdir, ea); + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. 
Hence can ignore THP here + */ + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); if (!ptep) return; + WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 49282045ee9..8add5806100 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -105,6 +105,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { + /* Only called for hugetlbfs pages, hence can ignore THP */ return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } @@ -673,11 +674,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) struct page *page; unsigned shift; unsigned long mask; - + /* + * Transparent hugepages are handled by generic code. We can skip them + * here. + */ ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. */ - if (!ptep || !shift) + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) return ERR_PTR(-EINVAL); mask = (1UL << shift) - 1; diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 48bf63ea652..313c85c5aa9 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb) void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end) { + int hugepage_shift; unsigned long flags; start = _ALIGN_DOWN(start, PAGE_SIZE); @@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte(mm->pgd, start); + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, + &hugepage_shift); unsigned long pte; if (ptep == NULL) @@ -214,7 +216,10 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (!(pte & _PAGE_HASHPTE)) continue; - hpte_need_flush(mm, start, ptep, pte, 0); + if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) + hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); + else + hpte_need_flush(mm, start, ptep, pte, 0); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); -- cgit v1.2.3-70-g09d2 From 0ac52dd7666d5c0d0147d73a8e4b1d1ffd81cdf3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:22 +0530 Subject: powerpc: Make linux pagetable walk safe with THP enabled We need to have irqs disabled to handle all the possible parallel updates to the Linux page table without holding locks. Events that we are interested in while walking page tables are 1) Page fault 2) unmap 3) THP split 4) THP collapse A) local_irq_disabled: ------------------------ 1) page fault: A none to valid transition via page fault is not an issue because we would see either a none or a valid entry. If it is none, we would error out of the page table walk. We may need to use on-stack values when checking for the type of page table elements, because if we do if (!is_hugepd()) { if (!pmd_none()) { if (pmd_bad()) { we could take that bad condition because the pmd got converted to a hugepd after the !is_hugepd check via a hugetlb fault. The right way would be to check for pmd_none higher up or use the on-stack value. 
2) A valid to none conversion via unmap: We can safely walk the upper level table, because we don't remove the page table entries until the RCU grace period has passed. So even if we followed a wrong pointer, the pointer stays valid until the grace period ends. A PTE pointer returned needs to be atomically checked for _PAGE_PRESENT and _PAGE_BUSY. A valid pointer returned could become none later. To prevent pte_clear we take _PAGE_BUSY. 3) THP split: A valid transparent hugepage is converted to normal pages. Before we split we do pmd_splitting_flush, which sets the hugepage PTE to _PAGE_SPLITTING. So when walking the page table we need to check for pmd_trans_splitting and handle that. The pte returned also needs to be checked for _PAGE_SPLITTING before setting _PAGE_BUSY, similar to _PAGE_PRESENT. We save the value of the PTE on the stack and check for the flag in the local pte value. If the flag is not set, we can safely operate on the local pte value and atomically set _PAGE_BUSY. 4) THP collapse: Normal pages get converted to a hugepage. In the collapse path, we mark the pmd none early (pmdp_clear_flush). With irqs disabled, if we are already walking the page table we would see the pmd_none and not continue. If we see a valid PMD, we should still check for _PAGE_PRESENT before setting _PAGE_BUSY, to make sure we didn't collapse the PTE to a huge PTE. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_utils_64.c | 27 ++++++++------- arch/powerpc/mm/hugepage-hash64.c | 3 ++ arch/powerpc/mm/hugetlbpage.c | 72 +++++++++++++++++++++++++-------------- arch/powerpc/mm/mem.c | 4 +++ 4 files changed, 68 insertions(+), 38 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7a81e866e7b..84523164398 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1180,13 +1180,25 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + /* * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); if (!ptep) - return; + goto out_exit; WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES @@ -1197,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * page size demotion here */ if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) - return; + goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ - /* Get VSID */ - ssize = user_segment_size(ea); - vsid = get_vsid(mm->context.id, ea, ssize); - if (!vsid) - return; - - /* Hash doesn't like irqs */ - local_irq_save(flags); - /* Is that local to this CPU ? 
*/ if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) local = 1; @@ -1230,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, mm->context.user_psize, mm->context.user_psize, pte_val(*ptep)); - +out_exit: local_irq_restore(flags); } diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 3c22fa307b9..34de9e0cdc3 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,6 +37,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, /* If PMD busy, retry the access */ if (unlikely(old_pmd & _PAGE_BUSY)) return 0; + /* If PMD is trans splitting retry the access */ + if (unlikely(old_pmd & _PAGE_SPLITTING)) + return 0; /* If PMD permissions don't match, take page fault */ if (unlikely(access & ~old_pmd)) return 1; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 8add5806100..e9e6882231d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -925,12 +925,16 @@ void flush_dcache_icache_hugepage(struct page *page) * (2) pointer to next table, as normal; bottom 6 bits == 0 * (3) leaf pte for huge page, bottom two bits != 00 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. */ + pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) { - pgd_t *pg; - pud_t *pu; - pmd_t *pm; + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; pte_t *ret_pte; hugepd_t *hpdp = NULL; unsigned pdshift = PGDIR_SHIFT; @@ -938,34 +942,42 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (shift) *shift = 0; - pg = pgdir + pgd_index(ea); - + pgdp = pgdir + pgd_index(ea); + pgd = ACCESS_ONCE(*pgdp); /* - * we should first check for none. That takes care of a - * a parallel hugetlb or THP pagefault moving none entries - * to respective types. + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. */ - if (pgd_none(*pg)) + if (pgd_none(pgd)) return NULL; - else if (pgd_huge(*pg)) { - ret_pte = (pte_t *) pg; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; goto out; - } else if (is_hugepd(pg)) - hpdp = (hugepd_t *)pg; + } else if (is_hugepd(&pgd)) + hpdp = (hugepd_t *)&pgd; else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ pdshift = PUD_SHIFT; - pu = pud_offset(pg, ea); + pudp = pud_offset(&pgd, ea); + pud = ACCESS_ONCE(*pudp); - if (pud_none(*pu)) + if (pud_none(pud)) return NULL; - else if (pud_huge(*pu)) { - ret_pte = (pte_t *) pu; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; goto out; - } else if (is_hugepd(pu)) - hpdp = (hugepd_t *)pu; + } else if (is_hugepd(&pud)) + hpdp = (hugepd_t *)&pud; else { pdshift = PMD_SHIFT; - pm = pmd_offset(pu, ea); + pmdp = pmd_offset(&pud, ea); + pmd = ACCESS_ONCE(*pmdp); /* * A hugepage collapse is captured by pmd_none, because * it mark the pmd none and do a hpte invalidate. 
@@ -975,16 +987,16 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift * hpte invalidate * */ - if (pmd_none(*pm) || pmd_trans_splitting(*pm)) + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return NULL; - if (pmd_huge(*pm) || pmd_large(*pm)) { - ret_pte = (pte_t *) pm; + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; goto out; - } else if (is_hugepd(pm)) - hpdp = (hugepd_t *)pm; + } else if (is_hugepd(&pmd)) + hpdp = (hugepd_t *)&pmd; else - return pte_offset_kernel(pm, ea); + return pte_offset_kernel(&pmd, ea); } } if (!hpdp) @@ -1020,6 +1032,14 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if ((pte_val(pte) & mask) != mask) return 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * check for splitting here + */ + if (pmd_trans_splitting(pte_pmd(pte))) + return 0; +#endif + /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0988a26e041..ccd49f9503a 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -508,6 +508,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { #ifdef CONFIG_PPC_STD_MMU + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ unsigned long access = 0, trap; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ -- cgit v1.2.3-70-g09d2 From 7888b4ddb44dccd68bc20d0dc4425707dff88c72 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Jun 2013 14:30:23 +0530 Subject: powerpc: Prevent gcc to re-read the pagetables GCC is very likely to read the pagetables just once and cache them in the local stack or in a register, but it can also decide to re-read the pagetables. The problem is that the pagetable in those places can change from under gcc. With THP/hugetlbfs the pmd (and pgd for hugetlbfs giga pages) can change under gup_fast. The pages won't be freed until we finish gup fast because we have irqs disabled and we free these pages via an rcu callback. 
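[Illustrative aside, not part of the commit above: the re-read hazard this patch closes can be sketched in plain user-space C. The ACCESS_ONCE() below is the classic volatile-cast definition, assumed here only for the demo; the rest of the snippet is hypothetical example code, not kernel API.]

    #include <stdio.h>

    /* Classic volatile-cast form of ACCESS_ONCE(), assumed for this demo only. */
    #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

    typedef unsigned long pte_val_t;

    /* Racy pattern: the compiler is free to load *ptep separately for the
     * check and for the use, so the two reads may see different values if
     * another CPU updates the entry in between. */
    static long lookup_racy(pte_val_t *ptep)
    {
            if (*ptep == 0)
                    return -1;
            return (long)(*ptep >> 12);
    }

    /* Pattern used by this series: one forced load into an on-stack copy,
     * then every check and use operates on that single snapshot. */
    static long lookup_snapshot(pte_val_t *ptep)
    {
            pte_val_t pte = ACCESS_ONCE(*ptep);

            if (pte == 0)
                    return -1;
            return (long)(pte >> 12);
    }

    int main(void)
    {
            pte_val_t entry = 0x2000;

            printf("racy=%ld snapshot=%ld\n",
                   lookup_racy(&entry), lookup_snapshot(&entry));
            return 0;
    }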
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/gup.c | 8 ++++---- arch/powerpc/mm/hugetlbpage.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm/hugetlbpage.c') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 223a255545f..49822d90ea9 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, ptep = pte_offset_kernel(&pmd, addr); do { - pte_t pte = *ptep; + pte_t pte = ACCESS_ONCE(*ptep); struct page *page; if ((pte_val(pte) & mask) != result) @@ -63,7 +63,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmdp = pmd_offset(&pud, addr); do { - pmd_t pmd = *pmdp; + pmd_t pmd = ACCESS_ONCE(*pmdp); next = pmd_addr_end(addr, end); /* @@ -97,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, pudp = pud_offset(&pgd, addr); do { - pud_t pud = *pudp; + pud_t pud = ACCESS_ONCE(*pudp); next = pud_addr_end(addr, end); if (pud_none(pud)) @@ -160,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pgdp = pgd_offset(mm, addr); do { - pgd_t pgd = *pgdp; + pgd_t pgd = ACCESS_ONCE(*pgdp); pr_devel(" %016lx: normal pgd %p\n", addr, (void *)pgd_val(pgd)); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e9e6882231d..f2f01fd3ec6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1024,7 +1024,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (pte_end < end) end = pte_end; - pte = *ptep; + pte = ACCESS_ONCE(*ptep); mask = _PAGE_PRESENT | _PAGE_USER; if (write) mask |= _PAGE_RW; -- cgit v1.2.3-70-g09d2
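[Editorial reading aid, appended after the series and not part of any commit: taken together, the last two patches define a calling convention for find_linux_pte_or_hugepte(). The sketch below only uses interfaces that appear in the patches above (local_irq_save/restore, find_linux_pte_or_hugepte, ACCESS_ONCE, pte_present, pte_pfn); the function example_ea_to_pfn() and its exact error handling are hypothetical, and the snippet is meant to be read against a powerpc kernel tree of this vintage rather than built stand-alone.]

    /*
     * Reading aid only: the usage pattern this series expects from callers
     * of find_linux_pte_or_hugepte().  Interrupts stay disabled for as long
     * as the returned pointer is dereferenced (page-table pages are freed
     * via RCU, so running irq-off delays their teardown), and checks are
     * made on an on-stack copy of the PTE instead of re-reading *ptep.
     */
    static unsigned long example_ea_to_pfn(struct mm_struct *mm, unsigned long ea)
    {
            unsigned long flags, pfn = 0;
            unsigned shift;
            pte_t *ptep, pte;

            local_irq_save(flags);
            ptep = find_linux_pte_or_hugepte(mm->pgd, ea, &shift);
            if (ptep) {
                    pte = ACCESS_ONCE(*ptep);  /* single snapshot of the entry */
                    /* shift != 0 would mean a huge mapping; skipped here for brevity */
                    if (!shift && pte_present(pte))
                            pfn = pte_pfn(pte);
            }
            local_irq_restore(flags);
            return pfn;
    }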