diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/compaction.c | 20 | ||||
-rw-r--r-- | mm/fremap.c | 28 | ||||
-rw-r--r-- | mm/huge_memory.c | 20 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/mempolicy.c | 92 | ||||
-rw-r--r-- | mm/migrate.c | 43 | ||||
-rw-r--r-- | mm/mmap.c | 20 | ||||
-rw-r--r-- | mm/mmu_context.c | 3 | ||||
-rw-r--r-- | mm/mprotect.c | 25 | ||||
-rw-r--r-- | mm/page_alloc.c | 30 | ||||
-rw-r--r-- | mm/percpu.c | 208 | ||||
-rw-r--r-- | mm/process_vm_access.c | 26 | ||||
-rw-r--r-- | mm/rmap.c | 15 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/vmpressure.c | 1 |
20 files changed, 307 insertions, 273 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 2d9f1504d75..2888024e0b0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -575,5 +575,5 @@ config PGTABLE_MAPPING then you should select this. This causes zsmalloc to use page table mapping rather than copying for object mapping. - You can check speed with zsmalloc benchmark[1]. - [1] https://github.com/spartacus06/zsmalloc + You can check speed with zsmalloc benchmark: + https://github.com/spartacus06/zsmapbench diff --git a/mm/compaction.c b/mm/compaction.c index b48c5259ea3..918577595ea 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, { int nr_scanned = 0, total_isolated = 0; struct page *cursor, *valid_page = NULL; - unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) - continue; + goto isolate_fail; + if (!valid_page) valid_page = page; if (!PageBuddy(page)) - continue; + goto isolate_fail; /* * The zone lock must be held to isolate freepages. @@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) - continue; + goto isolate_fail; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); - if (!isolated && strict) - break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (isolated) { blockpfn += isolated - 1; cursor += isolated - 1; + continue; } + +isolate_fail: + if (strict) + break; + else + continue; + } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); @@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * pages requested were isolated. If there were any failures, 0 is * returned and CMA will fail. */ - if (strict && nr_strict_required > total_isolated) + if (strict && blockpfn < end_pfn) total_isolated = 0; if (locked) diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221..34feba60a17 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -23,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82166bf974e..1546655a2d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1166,8 +1166,10 @@ alloc: } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) + if (ret & VM_FAULT_OOM) { split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } put_page(page); } count_vm_event(THP_FAULT_FALLBACK); @@ -1179,9 +1181,10 @@ alloc: if (page) { split_huge_page(page); put_page(page); - } + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); - ret |= VM_FAULT_OOM; goto out; } @@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { - entry = *pmd; - entry = pmd_mknuma(entry); + pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(ptl); } @@ -1963,7 +1961,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f..5b6b0039f72 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1127,8 +1127,8 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css->flags & CSS_ONLINE) && - (next_css == &root->css || css_tryget(next_css))) + if ((next_css == &root->css) || + ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) * protects memcg_name and makes sure that parallel ooms do not * interleave */ - static DEFINE_SPINLOCK(oom_info_lock); + static DEFINE_MUTEX(oom_info_lock); struct cgroup *task_cgrp; struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; @@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; - spin_lock(&oom_info_lock); + mutex_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1767,7 +1767,7 @@ done: pr_cont("\n"); } - spin_unlock(&oom_info_lock); + mutex_unlock(&oom_info_lock); } /* @@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; + struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) kmem_cgroup_css_offline(memcg); mem_cgroup_invalidate_reclaim_iterators(memcg); - mem_cgroup_reparent_charges(memcg); + + /* + * This requires that offlining is serialized. Right now that is + * guaranteed because css_killed_work_fn() holds the cgroup_mutex. + */ + css_for_each_descendant_post(iter, css) + mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f2f34a4e77..90002ea4363 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1651,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); + struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/memory.c b/mm/memory.c index be6a0c0d4ae..22dfa617bdd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); ret = VM_FAULT_HWPOISON; + page_cache_release(vmf.page); goto uncharge_out; } @@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3741,20 +3741,13 @@ retry: if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - /* - * If COW results in an oom, the huge pmd will - * have been split, so retry the fault on the - * pte for a smaller charge. - */ - if (unlikely(ret & VM_FAULT_OOM)) - goto retry; - return ret; + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); + return 0; } - - return 0; } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d..4755c857694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1556,10 +1556,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1586,8 +1586,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { long err = 0; unsigned long __user *nm = NULL; @@ -1609,9 +1609,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { long err = 0; unsigned long __user *nm = NULL; @@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - - goto out; - } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d8913..bed48809e5d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,6 +178,37 @@ out: } /* + * Congratulations to trinity for discovering this bug. + * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). + * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + +/* * Get rid of all migration entries and replace them by * references to the indicated page. */ @@ -186,6 +217,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); @@ -1158,7 +1190,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1576,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1779,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/mmap.c b/mm/mmap.c index 20ff0c33274..81ba54ff96c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2918,7 +2918,7 @@ static const struct vm_operations_struct special_mapping_vmops = { * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ -int install_special_mapping(struct mm_struct *mm, +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { @@ -2927,7 +2927,7 @@ int install_special_mapping(struct mm_struct *mm, vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; @@ -2948,11 +2948,23 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = _install_special_mapping(mm, + addr, len, vm_flags, pages); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + return 0; } static DEFINE_MUTEX(mm_all_locks_mutex); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8a8cd0265e5..f802c2d216a 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c178574..769a67a1580 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we + * know to be dirty. + */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; - ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, pte, ptent); + ptep_set_numa(mm, addr, pte); updated = true; } } } - - /* - * Avoid taking write faults for pages we know to be - * dirty. - */ - if (dirty_accountable && pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3758a09a00..3bac76ae4b3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } @@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; +} +#else +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return false; +} #endif /* @@ -1572,7 +1583,13 @@ again: get_pageblock_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + /* + * NOTE: GFP_THISNODE allocations do not partake in the kswapd + * aging protocol, so they can't be fair. + */ + if (!gfp_thisnode_allocation(gfp_flags)) + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1944,8 +1961,12 @@ zonelist_scan: * ultimately fall back to remote zones that do not * partake in the fairness round-robin cycle of this * zonelist. + * + * NOTE: GFP_THISNODE allocations do not partake in + * the kswapd aging protocol, so they can't be fair. */ - if (alloc_flags & ALLOC_WMARK_LOW) { + if ((alloc_flags & ALLOC_WMARK_LOW) && + !gfp_thisnode_allocation(gfp_mask)) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; if (!zone_local(preferred_zone, zone)) @@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (gfp_thisnode_allocation(gfp_mask)) goto nopage; restart: diff --git a/mm/percpu.c b/mm/percpu.c index 036cfe07050..63e24fb4387 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -102,10 +102,11 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ void *data; /* chunk data */ + int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - if (chunk->map_alloc >= chunk->map_used + 2) + if (chunk->map_alloc >= chunk->map_used + 3) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) + while (new_alloc < chunk->map_used + 3) new_alloc *= 2; return new_alloc; @@ -418,48 +419,6 @@ out_unlock: } /** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * @chunk->map must have enough free slots to accommodate the split. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_split_block(struct pcpu_chunk *chunk, int i, - int head, int tail) -{ - int nr_extra = !!head + !!tail; - - BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - - /* insert new subblocks */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } -} - -/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes @@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; + bool seen_free = false; + int *p; - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; + for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { int head, tail; + int this_size; + + off = *p; + if (off & 1) + continue; /* extra for alignment requirement */ head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); + this_size = (p[1] & ~1) - off; + if (this_size < head + size) { + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + max_contig = max(this_size, max_contig); continue; } @@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * than sizeof(int), which is very small but isn't too * uncommon for percpu allocations. */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; + if (head && (head < sizeof(int) || !(p[-1] & 1))) { + *p = off += head; + if (p[-1] & 1) chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; + else + max_contig = max(*p - p[-1], max_contig); + this_size -= head; head = 0; } /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) + tail = this_size - head - size; + if (tail < sizeof(int)) { tail = 0; + size = this_size - head; + } /* split if warranted */ if (head || tail) { - pcpu_split_block(chunk, i, head, tail); + int nr_extra = !!head + !!tail; + + /* insert new subblocks */ + memmove(p + nr_extra + 1, p + 1, + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + *++p = off += head; + ++i; + max_contig = max(head, max_contig); + } + if (tail) { + p[1] = off + size; + max_contig = max(tail, max_contig); } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); } + if (!seen_free) + chunk->first_free = i + 1; + /* update hint and mark allocated */ - if (is_last) + if (i + 1 == chunk->map_used) chunk->contig_hint = max_contig; /* fully scanned */ else chunk->contig_hint = max(chunk->contig_hint, max_contig); - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; + chunk->free_size -= size; + *p |= 1; pcpu_chunk_relocate(chunk, oslot); return off; @@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; + int off = 0; + unsigned i, j; + int to_free = 0; + int *p; + + freeme |= 1; /* we are searching for <given offset, in use> pair */ + + i = 0; + j = chunk->map_used; + while (i != j) { + unsigned k = (i + j) / 2; + off = chunk->map[k]; + if (off < freeme) + i = k + 1; + else if (off > freeme) + j = k; + else + i = j = k; + } BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; + if (i < chunk->first_free) + chunk->first_free = i; + p = chunk->map + i; + *p = off &= ~1; + chunk->free_size += (p[1] & ~1) - off; + + /* merge with next? */ + if (!(p[1] & 1)) + to_free++; /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); + if (i > 0 && !(p[-1] & 1)) { + to_free++; i--; + p--; } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + if (to_free) { + chunk->map_used -= to_free; + memmove(p + 1, p + 1 + to_free, + (chunk->map_used - i) * sizeof(chunk->map[0])); } - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } @@ -617,7 +615,9 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->map[0] = 0; + chunk->map[1] = pcpu_unit_size | 1; + chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; @@ -713,6 +713,16 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) unsigned long flags; void __percpu *ptr; + /* + * We want the lowest bit of offset available for in-use/free + * indicator, so force >= 16bit alignment and make size even. + */ + if (unlikely(align < 2)) + align = 2; + + if (unlikely(size & 1)) + size++; + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -1343,9 +1353,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -ai->static_size; + schunk->map[0] = 1; + schunk->map[1] = ai->static_size; + schunk->map_used = 1; if (schunk->free_size) - schunk->map[schunk->map_used++] = schunk->free_size; + schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); + else + schunk->map[1] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1358,8 +1372,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; - dchunk->map[dchunk->map_used++] = dchunk->free_size; + dchunk->map[0] = 1; + dchunk->map[1] = pcpu_reserved_chunk_limit; + dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map_used = 2; } /* link the first chunk in */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d043350..3c5cf68566e 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -456,25 +456,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); diff --git a/mm/rmap.c b/mm/rmap.c index d9d42316a99..11cf322f813 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1165,6 +1165,16 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; @@ -1360,8 +1370,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, } static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, struct vm_area_struct *vma) + struct address_space *mapping, void *arg) { + struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1663,7 +1674,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) if (list_empty(&mapping->i_mmap_nonlinear)) goto done; - ret = rwc->file_nonlinear(page, mapping, vma); + ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/sparse.c b/mm/sparse.c index 63c3ea5c119..38cad8fd739 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -268,7 +268,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while - * other sections referencing the usemap retmain active. Similarly, + * other sections referencing the usemap remain active. Similarly, * a pgdat can prevent a section being removed. If section A * contains a pgdat and section B contains the usemap, both * sections become inter-dependent. This allocates usemaps diff --git a/mm/swap.c b/mm/swap.c index b31ba67d440..0092097b3f4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,7 +98,7 @@ static void put_compound_page(struct page *page) } /* __split_huge_page_refcount can run under us */ - page_head = compound_trans_head(page); + page_head = compound_head(page); /* * THP can not break up slab pages so avoid taking @@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page) */ unsigned long flags; bool got; - struct page *page_head = compound_trans_head(page); + struct page *page_head = compound_head(page); /* Ref to put_compound_page() comment. */ if (!__compound_tail_refcounted(page_head)) { diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 196970a4541..d4042e75f7c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h> +#include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> |