diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 309 |
1 files changed, 140 insertions, 169 deletions
diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a734f..1f2287eaa88 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -209,14 +209,15 @@ static int tlb_next_batch(struct mmu_gather *tlb) * tear-down from @mm. The @fullmm argument is used when @mm is without * users and we're going to destroy the full address space (exit/execve). */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = -1UL; - tlb->end = 0; + tlb->start = start; + tlb->end = end; tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; @@ -256,8 +257,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e { struct mmu_gather_batch *batch, *next; - tlb->start = start; - tlb->end = end; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -374,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ /* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ @@ -862,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } @@ -1099,7 +1076,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1141,9 +1117,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1202,17 +1181,25 @@ again: * and page-free while holding it. */ if (force_flush) { + unsigned long old_end; + force_flush = 0; -#ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = range_start; + /* + * Flush the TLB just for the previous segment, + * then update the range to be the remaining + * TLB range. + */ + old_end = tlb->end; tlb->end = addr; -#endif + tlb_flush_mmu(tlb); - if (addr != end) { - range_start = addr; + + tlb->start = addr; + tlb->end = old_end; + + if (addr != end) goto again; - } } return addr; @@ -1397,7 +1384,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) @@ -1423,7 +1410,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, address, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, details); @@ -1496,7 +1483,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); + if (flags & FOLL_GET) + goto out; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } @@ -1507,8 +1495,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } goto out; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) @@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); reuse: + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + if (old_page) + page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); + flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -3115,6 +3123,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); @@ -3408,6 +3418,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -3517,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) + unsigned long addr, int page_nid, + int *flags) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } return mpol_misplaced(page, vma, addr); } @@ -3533,9 +3548,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid = -1; + int page_nid = -1; + int last_cpupid; int target_nid; bool migrated = false; + int flags = 0; /* * The "pte" at this point cannot be used safely without @@ -3562,123 +3579,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } + BUG_ON(is_zero_pfn(page_to_pfn(page))); + + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pte)) + flags |= TNF_NO_GROUP; - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + last_cpupid = page_cpupid_last(page); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. - */ - current_nid = numa_node_id(); put_page(page); goto out; } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - current_nid = target_nid; - -out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); - return 0; -} - -/* NUMA hinting page fault entry point for regular pmds */ -#ifdef CONFIG_NUMA_BALANCING -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - pte_t *pte, *orig_pte; - unsigned long _addr = addr & PMD_MASK; - unsigned long offset; - spinlock_t *ptl; - bool numa = false; - int local_nid = numa_node_id(); - - spin_lock(&mm->page_table_lock); - pmd = *pmdp; - if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); - numa = true; - } - spin_unlock(&mm->page_table_lock); - - if (!numa) - return 0; - - /* we're in a page fault so some vma must be in the range */ - BUG_ON(!vma); - BUG_ON(vma->vm_start >= _addr + PMD_SIZE); - offset = max(_addr, vma->vm_start) & ~PMD_MASK; - VM_BUG_ON(offset >= PMD_SIZE); - orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); - pte += offset >> PAGE_SHIFT; - for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { - pte_t pteval = *pte; - struct page *page; - int curr_nid = local_nid; - int target_nid; - bool migrated; - if (!pte_present(pteval)) - continue; - if (!pte_numa(pteval)) - continue; - if (addr >= vma->vm_end) { - vma = find_vma(mm, addr); - /* there's a pte present so there must be a vma */ - BUG_ON(!vma); - BUG_ON(addr < vma->vm_start); - } - if (pte_numa(pteval)) { - pteval = pte_mknonnuma(pteval); - set_pte_at(mm, addr, pte, pteval); - } - page = vm_normal_page(vma, addr, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (unlikely(page_mapcount(page) != 1)) - continue; - - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. - */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { - put_page(page); - continue; - } - - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); - - pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); + migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { + page_nid = target_nid; + flags |= TNF_MIGRATED; } - pte_unmap_unlock(orig_pte, ptl); +out: + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } -#else -static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - BUG(); - return 0; -} -#endif /* CONFIG_NUMA_BALANCING */ /* * These routines also need to handle stuff like marking pages dirty @@ -3693,7 +3631,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -int handle_pte_fault(struct mm_struct *mm, +static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { @@ -3752,22 +3690,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3780,9 +3710,12 @@ retry: if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + int ret = VM_FAULT_FALLBACK; if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + ret = do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { pmd_t orig_pmd = *pmd; int ret; @@ -3823,8 +3756,8 @@ retry: } } - if (pmd_numa(*pmd)) - return do_pmd_numa_page(mm, vma, address, pmd); + /* THP should already have been handled */ + BUG_ON(pmd_numa(*pmd)); /* * Use __pte_alloc instead of pte_alloc_map, because we can't @@ -3848,6 +3781,43 @@ retry: return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_oom_enable(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. @@ -4066,6 +4036,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, return len; } +EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* |