summaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c309
1 files changed, 140 insertions, 169 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 1ce2e2a734f..1f2287eaa88 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
#include "internal.h"
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -209,14 +209,15 @@ static int tlb_next_batch(struct mmu_gather *tlb)
* tear-down from @mm. The @fullmm argument is used when @mm is without
* users and we're going to destroy the full address space (exit/execve).
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
{
tlb->mm = mm;
- tlb->fullmm = fullmm;
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
tlb->need_flush_all = 0;
- tlb->start = -1UL;
- tlb->end = 0;
+ tlb->start = start;
+ tlb->end = end;
tlb->need_flush = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
@@ -256,8 +257,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
- tlb->start = start;
- tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -374,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
/*
- * If a p?d_bad entry is found while walking page tables, report
- * the error, before resetting entry to p?d_none. Usually (but
- * very seldom) called out from the p?d_none_or_clear_bad macros.
- */
-
-void pgd_clear_bad(pgd_t *pgd)
-{
- pgd_ERROR(*pgd);
- pgd_clear(pgd);
-}
-
-void pud_clear_bad(pud_t *pud)
-{
- pud_ERROR(*pud);
- pud_clear(pud);
-}
-
-void pmd_clear_bad(pmd_t *pmd)
-{
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
-}
-
-/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
@@ -862,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -1099,7 +1076,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- unsigned long range_start = addr;
again:
init_rss_vec(rss);
@@ -1141,9 +1117,12 @@ again:
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
- addr) != page->index)
- set_pte_at(mm, addr, pte,
- pgoff_to_pte(page->index));
+ addr) != page->index) {
+ pte_t ptfile = pgoff_to_pte(page->index);
+ if (pte_soft_dirty(ptent))
+ pte_file_mksoft_dirty(ptfile);
+ set_pte_at(mm, addr, pte, ptfile);
+ }
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
@@ -1202,17 +1181,25 @@ again:
* and page-free while holding it.
*/
if (force_flush) {
+ unsigned long old_end;
+
force_flush = 0;
-#ifdef HAVE_GENERIC_MMU_GATHER
- tlb->start = range_start;
+ /*
+ * Flush the TLB just for the previous segment,
+ * then update the range to be the remaining
+ * TLB range.
+ */
+ old_end = tlb->end;
tlb->end = addr;
-#endif
+
tlb_flush_mmu(tlb);
- if (addr != end) {
- range_start = addr;
+
+ tlb->start = addr;
+ tlb->end = old_end;
+
+ if (addr != end)
goto again;
- }
}
return addr;
@@ -1397,7 +1384,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
@@ -1423,7 +1410,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, 0);
+ tlb_gather_mmu(&tlb, mm, address, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, address, end);
unmap_single_vma(&tlb, vma, address, end, details);
@@ -1496,7 +1483,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pud_none(*pud))
goto no_page_table;
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- BUG_ON(flags & FOLL_GET);
+ if (flags & FOLL_GET)
+ goto out;
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
goto out;
}
@@ -1507,8 +1495,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pmd_none(*pmd))
goto no_page_table;
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+ if (flags & FOLL_GET) {
+ /*
+ * Refcount on tail pages are not well-defined and
+ * shouldn't be taken. The caller should handle a NULL
+ * return when trying to follow tail pages.
+ */
+ if (PageHead(page))
+ get_page(page);
+ else {
+ page = NULL;
+ goto out;
+ }
+ }
goto out;
}
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
@@ -2721,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
reuse:
+ /*
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ if (old_page)
+ page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3115,6 +3123,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
exclusive = 1;
}
flush_icache_page(vma, page);
+ if (pte_swp_soft_dirty(orig_pte))
+ pte = pte_mksoft_dirty(pte);
set_pte_at(mm, address, page_table, pte);
if (page == swapcache)
do_page_add_anon_rmap(page, vma, address, exclusive);
@@ -3408,6 +3418,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
+ pte_mksoft_dirty(entry);
if (anon) {
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
@@ -3517,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+ unsigned long addr, int page_nid,
+ int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ *flags |= TNF_FAULT_LOCAL;
+ }
return mpol_misplaced(page, vma, addr);
}
@@ -3533,9 +3548,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
- int current_nid = -1;
+ int page_nid = -1;
+ int last_cpupid;
int target_nid;
bool migrated = false;
+ int flags = 0;
/*
* The "pte" at this point cannot be used safely without
@@ -3562,123 +3579,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
+ BUG_ON(is_zero_pfn(page_to_pfn(page)));
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pte_write(pte))
+ flags |= TNF_NO_GROUP;
- current_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ /*
+ * Flag if the page is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ flags |= TNF_SHARED;
+
+ last_cpupid = page_cpupid_last(page);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
- /*
- * Account for the fault against the current node if it not
- * being replaced regardless of where the page is located.
- */
- current_nid = numa_node_id();
put_page(page);
goto out;
}
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- current_nid = target_nid;
-
-out:
- if (current_nid != -1)
- task_numa_fault(current_nid, 1, migrated);
- return 0;
-}
-
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t pmd;
- pte_t *pte, *orig_pte;
- unsigned long _addr = addr & PMD_MASK;
- unsigned long offset;
- spinlock_t *ptl;
- bool numa = false;
- int local_nid = numa_node_id();
-
- spin_lock(&mm->page_table_lock);
- pmd = *pmdp;
- if (pmd_numa(pmd)) {
- set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
- numa = true;
- }
- spin_unlock(&mm->page_table_lock);
-
- if (!numa)
- return 0;
-
- /* we're in a page fault so some vma must be in the range */
- BUG_ON(!vma);
- BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
- offset = max(_addr, vma->vm_start) & ~PMD_MASK;
- VM_BUG_ON(offset >= PMD_SIZE);
- orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
- pte += offset >> PAGE_SHIFT;
- for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
- pte_t pteval = *pte;
- struct page *page;
- int curr_nid = local_nid;
- int target_nid;
- bool migrated;
- if (!pte_present(pteval))
- continue;
- if (!pte_numa(pteval))
- continue;
- if (addr >= vma->vm_end) {
- vma = find_vma(mm, addr);
- /* there's a pte present so there must be a vma */
- BUG_ON(!vma);
- BUG_ON(addr < vma->vm_start);
- }
- if (pte_numa(pteval)) {
- pteval = pte_mknonnuma(pteval);
- set_pte_at(mm, addr, pte, pteval);
- }
- page = vm_normal_page(vma, addr, pteval);
- if (unlikely(!page))
- continue;
- /* only check non-shared pages */
- if (unlikely(page_mapcount(page) != 1))
- continue;
-
- /*
- * Note that the NUMA fault is later accounted to either
- * the node that is currently running or where the page is
- * migrated to.
- */
- curr_nid = local_nid;
- target_nid = numa_migrate_prep(page, vma, addr,
- page_to_nid(page));
- if (target_nid == -1) {
- put_page(page);
- continue;
- }
-
- /* Migrate to the requested node */
- pte_unmap_unlock(pte, ptl);
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- curr_nid = target_nid;
- task_numa_fault(curr_nid, 1, migrated);
-
- pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ migrated = migrate_misplaced_page(page, vma, target_nid);
+ if (migrated) {
+ page_nid = target_nid;
+ flags |= TNF_MIGRATED;
}
- pte_unmap_unlock(orig_pte, ptl);
+out:
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- BUG();
- return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
/*
* These routines also need to handle stuff like marking pages dirty
@@ -3693,7 +3631,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-int handle_pte_fault(struct mm_struct *mm,
+static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
@@ -3752,22 +3690,14 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*/
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- __set_current_state(TASK_RUNNING);
-
- count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(mm, PGFAULT);
-
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3780,9 +3710,12 @@ retry:
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = VM_FAULT_FALLBACK;
if (!vma->vm_ops)
- return do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ ret = do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
} else {
pmd_t orig_pmd = *pmd;
int ret;
@@ -3823,8 +3756,8 @@ retry:
}
}
- if (pmd_numa(*pmd))
- return do_pmd_numa_page(mm, vma, address, pmd);
+ /* THP should already have been handled */
+ BUG_ON(pmd_numa(*pmd));
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
@@ -3848,6 +3781,43 @@ retry:
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ int ret;
+
+ __set_current_state(TASK_RUNNING);
+
+ count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
+
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
+ /*
+ * Enable the memcg OOM handling for faults triggered in user
+ * space. Kernel faults are handled more gracefully.
+ */
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_oom_enable();
+
+ ret = __handle_mm_fault(mm, vma, address, flags);
+
+ if (flags & FAULT_FLAG_USER) {
+ mem_cgroup_oom_disable();
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ }
+
+ return ret;
+}
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
@@ -4066,6 +4036,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return len;
}
+EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*